// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>  // std::min
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>  // memset

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8x8_t a_weight,
                               const uint8x8_t b_weight) {
  const uint16x8_t a_product = vmull_u8(a, a_weight);
  const uint16x8_t b_product = vmull_u8(b, b_weight);

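  // Narrow back to 8 bits with rounding; per lane this computes
  // (a * a_weight + b * b_weight + 16) >> 5.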
  return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
}

// For vertical operations the weights are one constant value.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8_t weight) {
  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
}

// Fill |left| and |right| with the appropriate values for a given |base_step|.
inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
                         const uint8x8_t right_step, uint8x8_t* left,
                         uint8x8_t* right) {
  const uint8x16_t mixed = vld1q_u8(source);
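  // VQTbl1U8() performs a byte table lookup: each lane of the step vector
  // selects one byte of |mixed|. With the "even" step {0, 2, 4, ...} this
  // gathers every other byte, deinterleaving an upsampled source.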
  *left = VQTbl1U8(mixed, left_step);
  *right = VQTbl1U8(mixed, right_step);
}

// Handle signed step arguments by ignoring the sign. Negative values are
// considered out of range and overwritten later.
inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
                         const int8x8_t right_step, uint8x8_t* left,
                         uint8x8_t* right) {
  LoadStepwise(source, vreinterpret_u8_s8(left_step),
               vreinterpret_u8_s8(right_step), left, right);
}

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
                                 const int height, const uint8_t* const top,
                                 const int xstep, const bool upsampled) {
  assert(width == 4 || width == 8);

  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

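    // After any upsampling adjustment, the low 6 bits of |top_x| are the
    // fractional pixel position; halving maps it to the 5-bit blend weight.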
    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

    // 4 wide subsamples the output. 8 wide subsamples the input.
    if (width == 4) {
      const uint8x8_t left_values = vld1_u8(top + top_base_x);
      const uint8x8_t right_values = RightShift<8>(left_values);
      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);

      // If |upsampled| is true then extract every other value for output.
      const uint8x8_t value_stepped =
          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value_stepped, top_max_base);

      StoreLo4(dst, masked_value);
    } else /* width == 8 */ {
      uint8x8_t left_values, right_values;
      // WeightedBlend() steps up to Q registers. Downsample the input to avoid
      // doing extra calculations.
      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst, masked_value);
    }
    dst += stride;
    top_x += xstep;
  } while (++y < height);
}

// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
                                 const int width, const int height,
                                 const uint8_t* const top, const int xstep,
                                 const bool upsampled) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    int x = 0;
    do {
      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

      // Extract the input values based on |upsampled| here to avoid doing
      // twice as many calculations.
      uint8x8_t left_values, right_values;
      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst + x, masked_value);

      base_v = vadd_s8(base_v, block_step);
      x += 8;
    } while (x < width);
    top_x += xstep;
    dst += stride;
  } while (++y < height);
}

void DirectionalIntraPredictorZone1_NEON(void* const dest,
                                         const ptrdiff_t stride,
                                         const void* const top_row,
                                         const int width, const int height,
                                         const int xstep,
                                         const bool upsampled_top) {
  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
  uint8_t* dst = static_cast<uint8_t*>(dest);

  assert(xstep > 0);

  const int upsample_shift = static_cast<int>(upsampled_top);

  const uint8x8_t all = vcreate_u8(0x0706050403020100);

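  // |xstep| == 64 corresponds to a 45 degree prediction angle: the source
  // position advances exactly one pixel per row, so each output row is a
  // plain copy of |top| offset by the row index.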
  if (xstep == 64) {
    assert(!upsampled_top);
    const uint8_t* top_ptr = top + 1;
    int y = 0;
    do {
      memcpy(dst, top_ptr, width);
      memcpy(dst + stride, top_ptr + 1, width);
      memcpy(dst + 2 * stride, top_ptr + 2, width);
      memcpy(dst + 3 * stride, top_ptr + 3, width);
      dst += 4 * stride;
      top_ptr += 4;
      y += 4;
    } while (y < height);
  } else if (width == 4) {
    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
  } else if (xstep > 51) {
    // 7.11.2.10. Intra edge upsample selection process
    //   if ( d <= 0 || d >= 40 ) useUpsample = 0
    // For |upsampled_top| the delta is measured from vertical, i.e.
    // |prediction_angle - 90|. In |kDirectionalIntraPredictorDerivative[]|,
    // only angles less than 51 meet this criterion, and the |xstep| value for
    // angle 51 happens to be 51 as well. Shallower angles have greater
    // |xstep| values.
    assert(!upsampled_top);
    const int max_base_x = ((width + height) - 1);
    const uint8x8_t max_base = vdup_n_u8(max_base_x);
    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
    const uint8x8_t block_step = vdup_n_u8(8);

    int top_x = xstep;
    int y = 0;
    do {
      const int top_base_x = top_x >> 6;
      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
      int x = 0;
      // Only calculate a block of 8 when at least one of the output values is
      // within range. Otherwise it can read off the end of |top|.
      const int must_calculate_width =
          std::min(width, max_base_x - top_base_x + 7) & ~7;
      for (; x < must_calculate_width; x += 8) {
        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);

        // Since these |xstep| values can not be upsampled the load is
        // simplified.
        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
        const uint8x8_t masked_value =
            vbsl_u8(max_base_mask, value, top_max_base);

        vst1_u8(dst + x, masked_value);
        base_v = vadd_u8(base_v, block_step);
      }
      memset(dst + x, top[max_base_x], width - x);
      dst += stride;
      top_x += xstep;
    } while (++y < height);
  } else {
    DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
  }
}

// Process 4 or 8 |width| by 4 or 8 |height|.
template <int width>
inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
                                 const int height,
                                 const uint8_t* const left_column,
                                 const int base_left_y, const int ystep,
                                 const int upsample_shift) {
  assert(width == 4 || width == 8);
  assert(height == 4 || height == 8);
  const int scale_bits = 6 - upsample_shift;

  // Zone3 never runs out of left_column values.
  assert((width + height - 1) << upsample_shift >  // max_base_y
         ((ystep * width) >> scale_bits) +
             (/* base_step */ 1 << upsample_shift) *
                 (height - 1));  // left_base_y

  // Limited improvement for 8x8. ~20% faster for 64x64.
  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  uint8_t* dst = dest;
  uint8x8_t left_v[8], right_v[8], value_v[8];
  const uint8_t* const left = left_column;

  const int index_0 = base_left_y;
  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
               &left_v[0], &right_v[0]);
  value_v[0] = WeightedBlend(left_v[0], right_v[0],
                             ((index_0 << upsample_shift) & 0x3F) >> 1);

  const int index_1 = base_left_y + ystep;
  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
               &left_v[1], &right_v[1]);
  value_v[1] = WeightedBlend(left_v[1], right_v[1],
                             ((index_1 << upsample_shift) & 0x3F) >> 1);

  const int index_2 = base_left_y + ystep * 2;
  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
               &left_v[2], &right_v[2]);
  value_v[2] = WeightedBlend(left_v[2], right_v[2],
                             ((index_2 << upsample_shift) & 0x3F) >> 1);

  const int index_3 = base_left_y + ystep * 3;
  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
               &left_v[3], &right_v[3]);
  value_v[3] = WeightedBlend(left_v[3], right_v[3],
                             ((index_3 << upsample_shift) & 0x3F) >> 1);

  const int index_4 = base_left_y + ystep * 4;
  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
               &left_v[4], &right_v[4]);
  value_v[4] = WeightedBlend(left_v[4], right_v[4],
                             ((index_4 << upsample_shift) & 0x3F) >> 1);

  const int index_5 = base_left_y + ystep * 5;
  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
               &left_v[5], &right_v[5]);
  value_v[5] = WeightedBlend(left_v[5], right_v[5],
                             ((index_5 << upsample_shift) & 0x3F) >> 1);

  const int index_6 = base_left_y + ystep * 6;
  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
               &left_v[6], &right_v[6]);
  value_v[6] = WeightedBlend(left_v[6], right_v[6],
                             ((index_6 << upsample_shift) & 0x3F) >> 1);

  const int index_7 = base_left_y + ystep * 7;
  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
               &left_v[7], &right_v[7]);
  value_v[7] = WeightedBlend(left_v[7], right_v[7],
                             ((index_7 << upsample_shift) & 0x3F) >> 1);

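  // Each |value_v[i]| holds one output column of 8 pixels computed along the
  // prediction direction. Transpose so the block can be stored row by row.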
  // 8x8 transpose.
  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
                                   vcombine_u8(value_v[1], value_v[5]));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
                                   vcombine_u8(value_v[3], value_v[7]));

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  if (width == 4) {
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  } else {
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  }
}

// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative. This is accommodated by
// making sure the relative indices are within [-15, 0] when the function is
// called, and sliding them into the inclusive range [0, 15], relative to a
// lower base address.
constexpr int kPositiveIndexOffset = 15;
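// For example, a relative index of -15 becomes table index 0 and a relative
// index of 0 becomes 15, the last byte of a 16-byte load.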

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
                                            const ptrdiff_t stride,
                                            const int height,
                                            const uint8_t* const left_column,
                                            const int16x8_t left_y,
                                            const int upsample_shift) {
  assert(width == 4 || width == 8);

  // The shift argument must be a constant.
  int16x8_t offset_y, shift_upsampled = left_y;
  if (upsample_shift) {
    offset_y = vshrq_n_s16(left_y, 5);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, 6);
  }
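  // |left_y| holds Q6 fixed-point positions. |offset_y| is the integer sample
  // offset (at doubled resolution when upsampled) and the remaining fraction
  // becomes the blend weight below.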

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl supports
  // 2 Q registers as input which would allow for cumulative offsets of 32.
  const int16x8_t sampler =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
  const uint8x8_t left_values = vqmovun_s16(sampler);
  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));

  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);

  int y = 0;
  do {
    uint8x8_t src_left, src_right;
    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
                 left_values, right_values, &src_left, &src_right);
    const uint8x8_t val =
        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);

    if (width == 4) {
      StoreLo4(dst, val);
    } else {
      vst1_u8(dst, val);
    }
    dst += stride;
  } while (++y < height);
}

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
                                      const int height,
                                      const uint8_t* const top_row,
                                      int zone_bounds, int top_x,
                                      const int xstep,
                                      const int upsample_shift) {
  assert(width == 4 || width == 8);

  const int scale_bits_x = 6 - upsample_shift;

  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  int y = 0;
  do {
    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
    uint8x8_t left, right;
    LoadStepwise(src, base_step, right_step, &left, &right);

    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint8x8_t val = WeightedBlend(left, right, shift);

    uint8x8_t dst_blend = vld1_u8(dest);
    // |zone_bounds| values can be negative.
    uint8x8_t blend =
        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
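    // Lanes at or beyond the zone boundary take the value computed from
    // |top_row|; lanes before it keep the left-predicted value already
    // written to |dest|.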
    uint8x8_t output = vbsl_u8(blend, val, dst_blend);

    if (width == 4) {
      StoreLo4(dest, output);
    } else {
      vst1_u8(dest, output);
    }
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (++y < height);
}

// The height at which a load of 16 bytes will not contain enough source pixels
// from |left_column| to supply an accurate row when computing 8 pixels at a
// time. The values are found by inspection. By coincidence, all angles that
// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
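// An entry of 1024 never limits the shuffle path (all supported block heights
// are smaller); an entry of 0 disables it entirely for that |ystep| range.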

// 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
// then handle only blocks that take from |left_ptr|. Additionally, a fast
// index-shuffle approach is used for pred values from |left_column| in
// sections that permit it.
inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
                                 const uint8_t* const top_row,
                                 const uint8_t* const left_column,
                                 const int height, const int xstep,
                                 const int ystep, const bool upsampled_top,
                                 const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  // Loop incrementers for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep|
  // is at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);

  // For steep angles, the source pixels from |left_column| may not fit in a
  // 16-byte load for shuffling.
  // TODO(petersonab): Find a more precise formula for this subject to x.
  // TODO(johannkoenig): Revisit this for |width| == 4.
  const int max_shuffle_height =
      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);

  // Offsets the original zone bound value to simplify
  // x < (y + 1) * xstep / 64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));

  // This loop treats each set of 4 columns in 3 stages with y-value
  // boundaries. The first stage, before the first y-loop, covers blocks that
  // are only computed from the top row. The second stage, comprising two
  // y-loops, covers blocks that have a mixture of values computed from top or
  // left. The final stage covers blocks that are only computed from the left.
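  // Stage boundaries in rows for this set of columns:
  //   [0, max_top_only_y): only |top_row| is needed.
  //   [max_top_only_y, min_left_only_y): mixed; blend top values over left.
  //   [min_left_only_y, height): only |left_column| is needed.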
  if (min_top_only_x > 0) {
    // Round down to the nearest multiple of 8.
    // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
    const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
                            upsampled_top);

    if (max_top_only_y == height) return;

    int y = max_top_only_y;
    dst += stride * y;
    const int xstep_y = xstep * y;

    // All rows from |min_left_only_y| down for this set of columns only need
    // |left_column| to compute.
    const int min_left_only_y = std::min((4 << 6) / xstep, height);
    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
    // high. This means that max_shuffle_height is unbounded and xstep_bounds
    // will overflow in 16 bits. This is prevented by stopping the first
    // blending loop at min_left_only_y for such cases, which means we skip
    // over the second blending loop as well.
    const int left_shuffle_stop_y =
        std::min(max_shuffle_height, min_left_only_y);
    int xstep_bounds = xstep_bounds_base + xstep_y;
    int top_x = -xstep - xstep_y;

    // +8 increment is OK because if height is 4 this only goes once.
    for (; y < left_shuffle_stop_y;
         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone2FromLeftCol_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          left_y, upsample_left_shift);

      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
                                   xstep_bounds, top_x, xstep,
                                   upsample_top_shift);
    }

    // Pick up from the last y-value, using the slower but secure method for
    // left prediction.
    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
    for (; y < min_left_only_y;
         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone3_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);

      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
                                   xstep_bounds, top_x, xstep,
                                   upsample_top_shift);
    }
    // Loop over y for left_only rows.
    for (; y < height; y += 8, dst += stride8) {
      DirectionalZone3_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);
    }
  } else {
    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
                            upsampled_top);
  }
}

// Process a multiple of 8 |width|.
inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
                               const uint8_t* const top_row,
                               const uint8_t* const left_column,
                               const int width, const int height,
                               const int xstep, const int ystep,
                               const bool upsampled_top,
                               const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  // Loop incrementers for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;
  const int ystep8 = ystep << 3;

  // Process Wx4 blocks.
  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep|
  // is at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = std::min((height * xstep) >> 6, width);

  // For steep angles, the source pixels from |left_column| may not fit in a
  // 16-byte load for shuffling.
  // TODO(petersonab): Find a more precise formula for this subject to x.
  const int max_shuffle_height =
      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);

  // Offsets the original zone bound value to simplify
  // x < (y + 1) * xstep / 64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));

  // This loop treats each set of 8 columns in 3 stages with y-value
  // boundaries. The first stage, before the first y-loop, covers blocks that
  // are only computed from the top row. The second stage, comprising two
  // y-loops, covers blocks that have a mixture of values computed from top or
  // left. The final stage covers blocks that are only computed from the left.
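  // Each iteration advances to the next 8-column group: the zone boundary
  // moves by 8 columns' worth of xstep, and the left-column position advances
  // by eight steps of ystep, split between the integer |left_offset| and the
  // fractional |left_y|.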
  int x = 0;
  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
       xstep_bounds_base -= (8 << 6),
       left_y = vsubq_s16(left_y, increment_left8),
       left_offset -= left_base_increment8) {
    uint8_t* dst_x = dst + x;

    // Round down to the nearest multiple of 8.
    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
    DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
                            top_row + (x << upsample_top_shift), -xstep,
                            upsampled_top);

    if (max_top_only_y == height) continue;

    int y = max_top_only_y;
    dst_x += stride * y;
    const int xstep_y = xstep * y;

    // All rows from |min_left_only_y| down for this set of columns only need
    // |left_column| to compute.
    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
    // high. This means that max_shuffle_height is unbounded and xstep_bounds
    // will overflow in 16 bits. This is prevented by stopping the first
    // blending loop at min_left_only_y for such cases, which means we skip
    // over the second blending loop as well.
    const int left_shuffle_stop_y =
        std::min(max_shuffle_height, min_left_only_y);
    int xstep_bounds = xstep_bounds_base + xstep_y;
    int top_x = -xstep - xstep_y;

    for (; y < left_shuffle_stop_y;
         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone2FromLeftCol_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsample_left_shift);

      DirectionalZone1Blend_WxH<8>(
          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
          xstep_bounds, top_x, xstep, upsample_top_shift);
    }

    // Pick up from the last y-value, using the slower but secure method for
    // left prediction.
    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
    for (; y < min_left_only_y;
         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone3_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);

      DirectionalZone1Blend_WxH<8>(
          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
          xstep_bounds, top_x, xstep, upsample_top_shift);
    }
    // Loop over y for left_only rows.
    for (; y < height; y += 8, dst_x += stride8) {
      DirectionalZone3_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);
    }
  }
  // TODO(johannkoenig): May be able to remove this branch.
  if (x < width) {
    DirectionalZone1_WxH(dst + x, stride, width - x, height,
                         top_row + (x << upsample_top_shift), -xstep,
                         upsampled_top);
  }
}

void DirectionalIntraPredictorZone2_NEON(
    void* const dest, const ptrdiff_t stride, const void* const top_row,
    const void* const left_column, const int width, const int height,
    const int xstep, const int ystep, const bool upsampled_top,
    const bool upsampled_left) {
  // Increasing the negative buffer for this function allows more rows to be
  // processed at a time without branching in an inner loop to check the base.
  uint8_t top_buffer[288];
  uint8_t left_buffer[288];
  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16,
         160);
  const uint8_t* top_ptr = top_buffer + 144;
  const uint8_t* left_ptr = left_buffer + 144;
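  // |top_ptr| and |left_ptr| point 144 bytes into their buffers, so reads as
  // far as 144 bytes below them stay inside the stack buffers even though
  // only offsets [-16, 143] hold copied data; out-of-range lanes are treated
  // as invalid and overwritten later.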
  auto* dst = static_cast<uint8_t*>(dest);

  if (width == 4) {
    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
                         upsampled_top, upsampled_left);
  } else {
    DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
                       ystep, upsampled_top, upsampled_left);
  }
}

void DirectionalIntraPredictorZone3_NEON(void* const dest,
                                         const ptrdiff_t stride,
                                         const void* const left_column,
                                         const int width, const int height,
                                         const int ystep,
                                         const bool upsampled_left) {
  const auto* const left = static_cast<const uint8_t*>(left_column);

  assert(ystep > 0);

  const int upsample_shift = static_cast<int>(upsampled_left);
  const int scale_bits = 6 - upsample_shift;
  const int base_step = 1 << upsample_shift;

  if (width == 4 || height == 4) {
    // This block can handle all sizes but the specializations for other sizes
    // are faster.
    const uint8x8_t all = vcreate_u8(0x0706050403020100);
    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
    const uint8x8_t base_step_v = upsampled_left ? even : all;
    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));

    int y = 0;
    do {
      int x = 0;
      do {
        uint8_t* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        uint8x8_t left_v[4], right_v[4], value_v[4];
        const int ystep_base = ystep * x;
        const int offset = y * base_step;

        const int index_0 = ystep_base + ystep * 1;
        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
                     right_step, &left_v[0], &right_v[0]);
        value_v[0] = WeightedBlend(left_v[0], right_v[0],
                                   ((index_0 << upsample_shift) & 0x3F) >> 1);

        const int index_1 = ystep_base + ystep * 2;
        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
                     right_step, &left_v[1], &right_v[1]);
        value_v[1] = WeightedBlend(left_v[1], right_v[1],
                                   ((index_1 << upsample_shift) & 0x3F) >> 1);

        const int index_2 = ystep_base + ystep * 3;
        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
                     right_step, &left_v[2], &right_v[2]);
        value_v[2] = WeightedBlend(left_v[2], right_v[2],
                                   ((index_2 << upsample_shift) & 0x3F) >> 1);

        const int index_3 = ystep_base + ystep * 4;
        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
                     right_step, &left_v[3], &right_v[3]);
        value_v[3] = WeightedBlend(left_v[3], right_v[3],
                                   ((index_3 << upsample_shift) & 0x3F) >> 1);

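        // Each |value_v[i]| is one output column of 8 rows. Transpose so the
        // block can be stored as rows of 4 pixels.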
        // 8x4 transpose.
        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);

        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
                                         vreinterpret_u16_u8(b1.val[0]));
        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
                                         vreinterpret_u16_u8(b1.val[1]));

        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));

        if (height > 4) {
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
        }
        x += 4;
      } while (x < width);
      y += 8;
    } while (y < height);
  } else {  // 8x8 at a time.
    // Limited improvement for 8x8. ~20% faster for 64x64.
    int y = 0;
    do {
      int x = 0;
      do {
        uint8_t* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        const int ystep_base = ystep * (x + 1);

        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
                                ystep_base, ystep, upsample_shift);
        x += 8;
      } while (x < width);
      y += 8;
    } while (y < height);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}

}  // namespace
}  // namespace low_bitdepth

void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

void IntraPredDirectionalInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON