1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/convolve.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26
27 #include "src/dsp/arm/common_neon.h"
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/utils/common.h"
31 #include "src/utils/compiler_attributes.h"
32
33 namespace libgav1 {
34 namespace dsp {
35 namespace low_bitdepth {
36 namespace {
37
38 constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
39 constexpr int kHorizontalOffset = 3;
40 constexpr int kFilterIndexShift = 6;
41
42 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
43 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
44 // sum from overflowing int16_t.
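// As a rough bound (not spelled out in this file): with 8-bit samples and half
// taps whose magnitudes sum to at most 128, |sum| stays below
// 255 * 128 = 32640 < 2^15, so reinterpreting the unsigned accumulator as
// int16x8_t is safe.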
45 template <int filter_index, bool negative_outside_taps = false>
46 int16x8_t SumOnePassTaps(const uint8x8_t* const src,
47 const uint8x8_t* const taps) {
48 uint16x8_t sum;
49 if (filter_index == 0) {
50 // 6 taps. + - + + - +
51 sum = vmull_u8(src[0], taps[0]);
52 // Unsigned overflow will result in a valid int16_t value.
53 sum = vmlsl_u8(sum, src[1], taps[1]);
54 sum = vmlal_u8(sum, src[2], taps[2]);
55 sum = vmlal_u8(sum, src[3], taps[3]);
56 sum = vmlsl_u8(sum, src[4], taps[4]);
57 sum = vmlal_u8(sum, src[5], taps[5]);
58 } else if (filter_index == 1 && negative_outside_taps) {
59 // 6 taps. - + + + + -
60 // Set a base we can subtract from.
61 sum = vmull_u8(src[1], taps[1]);
62 sum = vmlsl_u8(sum, src[0], taps[0]);
63 sum = vmlal_u8(sum, src[2], taps[2]);
64 sum = vmlal_u8(sum, src[3], taps[3]);
65 sum = vmlal_u8(sum, src[4], taps[4]);
66 sum = vmlsl_u8(sum, src[5], taps[5]);
67 } else if (filter_index == 1) {
68 // 6 taps. All are positive.
69 sum = vmull_u8(src[0], taps[0]);
70 sum = vmlal_u8(sum, src[1], taps[1]);
71 sum = vmlal_u8(sum, src[2], taps[2]);
72 sum = vmlal_u8(sum, src[3], taps[3]);
73 sum = vmlal_u8(sum, src[4], taps[4]);
74 sum = vmlal_u8(sum, src[5], taps[5]);
75 } else if (filter_index == 2) {
76 // 8 taps. - + - + + - + -
77 sum = vmull_u8(src[1], taps[1]);
78 sum = vmlsl_u8(sum, src[0], taps[0]);
79 sum = vmlsl_u8(sum, src[2], taps[2]);
80 sum = vmlal_u8(sum, src[3], taps[3]);
81 sum = vmlal_u8(sum, src[4], taps[4]);
82 sum = vmlsl_u8(sum, src[5], taps[5]);
83 sum = vmlal_u8(sum, src[6], taps[6]);
84 sum = vmlsl_u8(sum, src[7], taps[7]);
85 } else if (filter_index == 3) {
86 // 2 taps. All are positive.
87 sum = vmull_u8(src[0], taps[0]);
88 sum = vmlal_u8(sum, src[1], taps[1]);
89 } else if (filter_index == 4) {
90 // 4 taps. - + + -
91 sum = vmull_u8(src[1], taps[1]);
92 sum = vmlsl_u8(sum, src[0], taps[0]);
93 sum = vmlal_u8(sum, src[2], taps[2]);
94 sum = vmlsl_u8(sum, src[3], taps[3]);
95 } else if (filter_index == 5) {
96 // 4 taps. All are positive.
97 sum = vmull_u8(src[0], taps[0]);
98 sum = vmlal_u8(sum, src[1], taps[1]);
99 sum = vmlal_u8(sum, src[2], taps[2]);
100 sum = vmlal_u8(sum, src[3], taps[3]);
101 }
102 return vreinterpretq_s16_u16(sum);
103 }
104
105 template <int filter_index, bool negative_outside_taps>
106 int16x8_t SumHorizontalTaps(const uint8_t* const src,
107 const uint8x8_t* const v_tap) {
108 uint8x8_t v_src[8];
109 const uint8x16_t src_long = vld1q_u8(src);
110 int16x8_t sum;
111
112 if (filter_index < 2) {
113 v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
114 v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
115 v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
116 v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
117 v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
118 v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
119 sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
120 } else if (filter_index == 2) {
121 v_src[0] = vget_low_u8(src_long);
122 v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
123 v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
124 v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
125 v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
126 v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
127 v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
128 v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
129 sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
130 } else if (filter_index == 3) {
131 v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
132 v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
133 sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
134 } else if (filter_index > 3) {
135 v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
136 v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
137 v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
138 v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
139 sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
140 }
141 return sum;
142 }
143
144 template <int filter_index, bool negative_outside_taps>
145 uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
146 const uint8x8_t* const v_tap) {
147 int16x8_t sum =
148 SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
149
150 // Normally the Horizontal pass does the downshift in two passes:
151 // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
152 // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
153 // requires adding the rounding offset from the skipped shift.
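// Worked example, assuming the usual 8bpp values kInterRoundBitsHorizontal == 3
// and kFilterBits == 7: the two passes would be ((x + 2) >> 2 + 8) >> 4, which
// is roughly (x + 2 + 32) >> 6. The vqrshrun_n_s16 by 6 below supplies the
// + 32, and the vaddq_s16 supplies the skipped + 2, i.e.
// 1 << (kInterRoundBitsHorizontal - 2).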
154 constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
155
156 sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
157 return vqrshrun_n_s16(sum, kFilterBits - 1);
158 }
159
160 template <int filter_index, bool negative_outside_taps>
161 uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
162 const uint8x8_t* const v_tap) {
163 const int16x8_t sum =
164 SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
165
166 return vreinterpretq_u16_s16(
167 vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
168 }
169
170 template <int filter_index>
171 int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
172 const uint8x8_t* const v_tap) {
173 uint16x8_t sum;
174 const uint8x8_t input0 = vld1_u8(src);
175 src += src_stride;
176 const uint8x8_t input1 = vld1_u8(src);
177 uint8x8x2_t input = vzip_u8(input0, input1);
178
179 if (filter_index == 3) {
180 // tap signs : + +
181 sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
182 sum = vmlal_u8(sum, input.val[1], v_tap[4]);
183 } else if (filter_index == 4) {
184 // tap signs : - + + -
185 sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
186 sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
187 sum = vmlal_u8(sum, input.val[1], v_tap[4]);
188 sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
189 } else {
190 // tap signs : + + + +
191 sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
192 sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
193 sum = vmlal_u8(sum, input.val[1], v_tap[4]);
194 sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
195 }
196
197 return vreinterpretq_s16_u16(sum);
198 }
199
200 template <int filter_index>
201 uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
202 const ptrdiff_t src_stride,
203 const uint8x8_t* const v_tap) {
204 int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
205
206 // Normally the Horizontal pass does the downshift in two passes:
207 // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
208 // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
209 // requires adding the rounding offset from the skipped shift.
210 constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
211
212 sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
213 return vqrshrun_n_s16(sum, kFilterBits - 1);
214 }
215
216 template <int filter_index>
217 uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
218 const ptrdiff_t src_stride,
219 const uint8x8_t* const v_tap) {
220 const int16x8_t sum =
221 SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
222
223 return vreinterpretq_u16_s16(
224 vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
225 }
226
227 template <int num_taps, int step, int filter_index,
228 bool negative_outside_taps = true, bool is_2d = false,
229 bool is_compound = false>
230 void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
231 void* const dest, const ptrdiff_t pred_stride,
232 const int width, const int height,
233 const uint8x8_t* const v_tap) {
234 auto* dest8 = static_cast<uint8_t*>(dest);
235 auto* dest16 = static_cast<uint16_t*>(dest);
236
237 // 4 tap filters are never used when width > 4.
238 if (num_taps != 4 && width > 4) {
239 int y = 0;
240 do {
241 int x = 0;
242 do {
243 if (is_2d || is_compound) {
244 const uint16x8_t v_sum =
245 HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
246 v_tap);
247 vst1q_u16(&dest16[x], v_sum);
248 } else {
249 const uint8x8_t result =
250 SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
251 v_tap);
252 vst1_u8(&dest8[x], result);
253 }
254 x += step;
255 } while (x < width);
256 src += src_stride;
257 dest8 += pred_stride;
258 dest16 += pred_stride;
259 } while (++y < height);
260 return;
261 }
262
263 // The horizontal pass only needs to account for |num_taps| 2 and 4 when
264 // |width| <= 4.
265 assert(width <= 4);
266 assert(num_taps <= 4);
267 if (num_taps <= 4) {
268 if (width == 4) {
269 int y = 0;
270 do {
271 if (is_2d || is_compound) {
272 const uint16x8_t v_sum =
273 HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
274 v_tap);
275 vst1_u16(dest16, vget_low_u16(v_sum));
276 } else {
277 const uint8x8_t result =
278 SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
279 v_tap);
280 StoreLo4(&dest8[0], result);
281 }
282 src += src_stride;
283 dest8 += pred_stride;
284 dest16 += pred_stride;
285 } while (++y < height);
286 return;
287 }
288
289 if (!is_compound) {
290 int y = 0;
291 do {
292 if (is_2d) {
293 const uint16x8_t sum =
294 HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
295 dest16[0] = vgetq_lane_u16(sum, 0);
296 dest16[1] = vgetq_lane_u16(sum, 2);
297 dest16 += pred_stride;
298 dest16[0] = vgetq_lane_u16(sum, 1);
299 dest16[1] = vgetq_lane_u16(sum, 3);
300 dest16 += pred_stride;
301 } else {
302 const uint8x8_t sum =
303 SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
304
305 dest8[0] = vget_lane_u8(sum, 0);
306 dest8[1] = vget_lane_u8(sum, 2);
307 dest8 += pred_stride;
308
309 dest8[0] = vget_lane_u8(sum, 1);
310 dest8[1] = vget_lane_u8(sum, 3);
311 dest8 += pred_stride;
312 }
313
314 src += src_stride << 1;
315 y += 2;
316 } while (y < height - 1);
317
318 // The 2d filters have an odd |height| because the horizontal pass
319 // generates context for the vertical pass.
320 if (is_2d) {
321 assert(height % 2 == 1);
322 uint16x8_t sum;
323 const uint8x8_t input = vld1_u8(src);
324 if (filter_index == 3) { // |num_taps| == 2
325 sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
326 sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
327 } else if (filter_index == 4) {
328 sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
329 sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
330 sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
331 sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
332 } else {
333 assert(filter_index == 5);
334 sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
335 sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
336 sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
337 sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
338 }
339 // |sum| contains an int16_t value.
340 sum = vreinterpretq_u16_s16(vrshrq_n_s16(
341 vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
342 Store2<0>(dest16, sum);
343 }
344 }
345 }
346 }
347
348 // Process 16 bit inputs and accumulate the sums in 32 bits.
349 template <int num_taps, bool is_compound>
350 inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
351 const int16x8_t taps) {
352 const int16x4_t taps_lo = vget_low_s16(taps);
353 const int16x4_t taps_hi = vget_high_s16(taps);
354 int32x4_t sum;
355 if (num_taps == 8) {
356 sum = vmull_lane_s16(src[0], taps_lo, 0);
357 sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
358 sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
359 sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
360 sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
361 sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
362 sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
363 sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
364 } else if (num_taps == 6) {
365 sum = vmull_lane_s16(src[0], taps_lo, 1);
366 sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
367 sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
368 sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
369 sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
370 sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
371 } else if (num_taps == 4) {
372 sum = vmull_lane_s16(src[0], taps_lo, 2);
373 sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
374 sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
375 sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
376 } else if (num_taps == 2) {
377 sum = vmull_lane_s16(src[0], taps_lo, 3);
378 sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
379 }
380
381 if (is_compound) {
382 return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
383 }
384
385 return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
386 }
387
388 template <int num_taps, bool is_compound>
389 int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
390 const int16x8_t taps) {
391 const int16x4_t taps_lo = vget_low_s16(taps);
392 const int16x4_t taps_hi = vget_high_s16(taps);
393 int32x4_t sum_lo, sum_hi;
394 if (num_taps == 8) {
395 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
396 sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
397 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
398 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
399 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
400 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
401 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
402 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
403
404 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
405 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
406 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
407 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
408 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
409 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
410 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
411 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
412 } else if (num_taps == 6) {
413 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
414 sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
415 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
416 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
417 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
418 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
419
420 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
421 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
422 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
423 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
424 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
425 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
426 } else if (num_taps == 4) {
427 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
428 sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
429 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
430 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
431
432 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
433 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
434 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
435 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
436 } else if (num_taps == 2) {
437 sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
438 sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
439
440 sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
441 sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
442 }
443
444 if (is_compound) {
445 return vcombine_s16(
446 vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
447 vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
448 }
449
450 return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
451 vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
452 }
453
454 template <int num_taps, bool is_compound = false>
455 void Filter2DVertical(const uint16_t* src, void* const dst,
456 const ptrdiff_t dst_stride, const int width,
457 const int height, const int16x8_t taps) {
458 assert(width >= 8);
459 constexpr int next_row = num_taps - 1;
460 // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
461 const ptrdiff_t src_stride = width;
462
463 auto* dst8 = static_cast<uint8_t*>(dst);
464 auto* dst16 = static_cast<uint16_t*>(dst);
465
466 int x = 0;
467 do {
468 int16x8_t srcs[8];
469 const uint16_t* src_x = src + x;
470 srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
471 src_x += src_stride;
472 if (num_taps >= 4) {
473 srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
474 src_x += src_stride;
475 srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
476 src_x += src_stride;
477 if (num_taps >= 6) {
478 srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
479 src_x += src_stride;
480 srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
481 src_x += src_stride;
482 if (num_taps == 8) {
483 srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
484 src_x += src_stride;
485 srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
486 src_x += src_stride;
487 }
488 }
489 }
490
491 int y = 0;
492 do {
493 srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
494 src_x += src_stride;
495
496 const int16x8_t sum =
497 SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
498 if (is_compound) {
499 vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
500 } else {
501 vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
502 }
503
504 srcs[0] = srcs[1];
505 if (num_taps >= 4) {
506 srcs[1] = srcs[2];
507 srcs[2] = srcs[3];
508 if (num_taps >= 6) {
509 srcs[3] = srcs[4];
510 srcs[4] = srcs[5];
511 if (num_taps == 8) {
512 srcs[5] = srcs[6];
513 srcs[6] = srcs[7];
514 }
515 }
516 }
517 } while (++y < height);
518 x += 8;
519 } while (x < width);
520 }
521
522 // Take advantage of |src_stride| == |width| to process two rows at a time.
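// The intermediate buffer is 4 pixels wide here, so each 8-lane vld1q_u16
// spans two consecutive rows; the odd-numbered |srcs| entries are assembled by
// pairing the high half of one load with the low half of the next.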
523 template <int num_taps, bool is_compound = false>
524 void Filter2DVertical4xH(const uint16_t* src, void* const dst,
525 const ptrdiff_t dst_stride, const int height,
526 const int16x8_t taps) {
527 auto* dst8 = static_cast<uint8_t*>(dst);
528 auto* dst16 = static_cast<uint16_t*>(dst);
529
530 int16x8_t srcs[9];
531 srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
532 src += 8;
533 if (num_taps >= 4) {
534 srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
535 src += 8;
536 srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
537 if (num_taps >= 6) {
538 srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
539 src += 8;
540 srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
541 if (num_taps == 8) {
542 srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
543 src += 8;
544 srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
545 }
546 }
547 }
548
549 int y = 0;
550 do {
551 srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
552 src += 8;
553 srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
554 vget_low_s16(srcs[num_taps]));
555
556 const int16x8_t sum =
557 SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
558 if (is_compound) {
559 const uint16x8_t results = vreinterpretq_u16_s16(sum);
560 vst1q_u16(dst16, results);
561 dst16 += 4 << 1;
562 } else {
563 const uint8x8_t results = vqmovun_s16(sum);
564
565 StoreLo4(dst8, results);
566 dst8 += dst_stride;
567 StoreHi4(dst8, results);
568 dst8 += dst_stride;
569 }
570
571 srcs[0] = srcs[2];
572 if (num_taps >= 4) {
573 srcs[1] = srcs[3];
574 srcs[2] = srcs[4];
575 if (num_taps >= 6) {
576 srcs[3] = srcs[5];
577 srcs[4] = srcs[6];
578 if (num_taps == 8) {
579 srcs[5] = srcs[7];
580 srcs[6] = srcs[8];
581 }
582 }
583 }
584 y += 2;
585 } while (y < height);
586 }
587
588 // Take advantage of |src_stride| == |width| to process four rows at a time.
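// With a 2 pixel wide intermediate buffer, each 8-lane load covers four rows;
// the in-between |srcs| entries are built from neighboring loads with
// vextq_s16/vcombine_s16.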
589 template <int num_taps>
590 void Filter2DVertical2xH(const uint16_t* src, void* const dst,
591 const ptrdiff_t dst_stride, const int height,
592 const int16x8_t taps) {
593 constexpr int next_row = (num_taps < 6) ? 4 : 8;
594
595 auto* dst8 = static_cast<uint8_t*>(dst);
596
597 int16x8_t srcs[9];
598 srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
599 src += 8;
600 if (num_taps >= 6) {
601 srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
602 src += 8;
603 srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
604 if (num_taps == 8) {
605 srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
606 srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
607 }
608 }
609
610 int y = 0;
611 do {
612 srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
613 src += 8;
614 if (num_taps == 2) {
615 srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
616 } else if (num_taps == 4) {
617 srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
618 srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
619 srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
620 } else if (num_taps == 6) {
621 srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
622 srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
623 srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
624 } else if (num_taps == 8) {
625 srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
626 srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
627 srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
628 }
629
630 const int16x8_t sum =
631 SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
632 const uint8x8_t results = vqmovun_s16(sum);
633
634 Store2<0>(dst8, results);
635 dst8 += dst_stride;
636 Store2<1>(dst8, results);
637 // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
638 // Therefore we don't need to check this condition when |height| > 4.
639 if (num_taps <= 4 && height == 2) return;
640 dst8 += dst_stride;
641 Store2<2>(dst8, results);
642 dst8 += dst_stride;
643 Store2<3>(dst8, results);
644 dst8 += dst_stride;
645
646 srcs[0] = srcs[4];
647 if (num_taps == 6) {
648 srcs[1] = srcs[5];
649 srcs[4] = srcs[8];
650 } else if (num_taps == 8) {
651 srcs[1] = srcs[5];
652 srcs[2] = srcs[6];
653 srcs[3] = srcs[7];
654 srcs[4] = srcs[8];
655 }
656
657 y += 4;
658 } while (y < height);
659 }
660
661 template <bool is_2d = false, bool is_compound = false>
662 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
663 const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
664 const ptrdiff_t dst_stride, const int width, const int height,
665 const int subpixel, const int filter_index) {
666 // Duplicate the absolute value for each tap. Negative taps are corrected
667 // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
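// For example, a tap of -4 is stored as 4 and applied with vmlsl_u8; even if
// the running uint16_t sum wraps past zero, reinterpreting it as int16_t
// recovers the intended negative value (two's complement).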
668 uint8x8_t v_tap[kSubPixelTaps];
669 const int filter_id = (subpixel >> 6) & kSubPixelMask;
670 assert(filter_id != 0);
671
672 for (int k = 0; k < kSubPixelTaps; ++k) {
673 v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
674 }
675
676 if (filter_index == 2) { // 8 tap.
677 FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
678 src, src_stride, dst, dst_stride, width, height, v_tap);
679 } else if (filter_index == 1) { // 6 tap.
680 // Check if outside taps are positive.
681 if ((filter_id == 1) | (filter_id == 15)) {
682 FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
683 src, src_stride, dst, dst_stride, width, height, v_tap);
684 } else {
685 FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
686 src, src_stride, dst, dst_stride, width, height, v_tap);
687 }
688 } else if (filter_index == 0) { // 6 tap.
689 FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
690 src, src_stride, dst, dst_stride, width, height, v_tap);
691 } else if (filter_index == 4) { // 4 tap.
692 FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
693 src, src_stride, dst, dst_stride, width, height, v_tap);
694 } else if (filter_index == 5) { // 4 tap.
695 FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
696 src, src_stride, dst, dst_stride, width, height, v_tap);
697 } else { // 2 tap.
698 FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
699 src, src_stride, dst, dst_stride, width, height, v_tap);
700 }
701 }
702
703 int GetNumTapsInFilter(const int filter_index) {
704 if (filter_index < 2) {
705 // Despite the names these only use 6 taps.
706 // kInterpolationFilterEightTap
707 // kInterpolationFilterEightTapSmooth
708 return 6;
709 }
710
711 if (filter_index == 2) {
712 // kInterpolationFilterEightTapSharp
713 return 8;
714 }
715
716 if (filter_index == 3) {
717 // kInterpolationFilterBilinear
718 return 2;
719 }
720
721 assert(filter_index > 3);
722 // For small sizes (width/height <= 4) the large filters are replaced with 4
723 // tap options.
724 // If the original filters were |kInterpolationFilterEightTap| or
725 // |kInterpolationFilterEightTapSharp| then it becomes
726 // |kInterpolationFilterSwitchable|.
727 // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
728 // tap filter.
729 return 4;
730 }
731
732 void Convolve2D_NEON(const void* const reference,
733 const ptrdiff_t reference_stride,
734 const int horizontal_filter_index,
735 const int vertical_filter_index, const int subpixel_x,
736 const int subpixel_y, const int width, const int height,
737 void* prediction, const ptrdiff_t pred_stride) {
738 const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
739 const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
740 const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
741
742 // The output of the horizontal filter is guaranteed to fit in 16 bits.
743 uint16_t
744 intermediate_result[kMaxSuperBlockSizeInPixels *
745 (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
746 const int intermediate_height = height + vertical_taps - 1;
747
748 const ptrdiff_t src_stride = reference_stride;
749 const auto* src = static_cast<const uint8_t*>(reference) -
750 (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
751
752 DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
753 width, intermediate_height, subpixel_x,
754 horiz_filter_index);
755
756 // Vertical filter.
757 auto* dest = static_cast<uint8_t*>(prediction);
758 const ptrdiff_t dest_stride = pred_stride;
759 const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
760 assert(filter_id != 0);
761
762 const int16x8_t taps =
763 vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
764
765 if (vertical_taps == 8) {
766 if (width == 2) {
767 Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
768 taps);
769 } else if (width == 4) {
770 Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
771 taps);
772 } else {
773 Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
774 taps);
775 }
776 } else if (vertical_taps == 6) {
777 if (width == 2) {
778 Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
779 taps);
780 } else if (width == 4) {
781 Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
782 taps);
783 } else {
784 Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
785 taps);
786 }
787 } else if (vertical_taps == 4) {
788 if (width == 2) {
789 Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
790 taps);
791 } else if (width == 4) {
792 Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
793 taps);
794 } else {
795 Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
796 taps);
797 }
798 } else { // |vertical_taps| == 2
799 if (width == 2) {
800 Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
801 taps);
802 } else if (width == 4) {
803 Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
804 taps);
805 } else {
806 Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
807 taps);
808 }
809 }
810 }
811
812 // There are many opportunities for overreading in scaled convolve, because the
813 // range of starting points for filter windows is anywhere from 0 to 16 for 8
814 // destination pixels, and the window sizes range from 2 to 8. To accommodate
815 // this range concisely, we use |grade_x| to mean the most steps in src that can
816 // be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
817 // we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
818 // increments. The first load covers the initial elements of src_x, while the
819 // final load covers the taps.
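// Roughly speaking, |grade_x| is 2 when |step_x| implies a downscale of more
// than one source pixel per destination pixel, in which case 8 outputs plus
// the filter taps can span more than 16 source bytes and the extra 8-byte load
// in LoadSrcVals() is required.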
820 template <int grade_x>
821 inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
822 uint8x8x3_t ret;
823 const uint8x16_t src_val = vld1q_u8(src_x);
824 ret.val[0] = vget_low_u8(src_val);
825 ret.val[1] = vget_high_u8(src_val);
826 if (grade_x > 1) {
827 ret.val[2] = vld1_u8(src_x + 16);
828 }
829 return ret;
830 }
831
832 // Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
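// Storing one filter column per vector lets VQTbl1U8() gather, for each output
// lane, the tap belonging to that lane's filter_id in a single table lookup.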
833 inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
834 assert(tap_index < 2);
835 alignas(
836 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
837 {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
838 {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
839
840 return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
841 }
842
843 template <int grade_x>
844 inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
845 const ptrdiff_t src_stride,
846 const int width, const int subpixel_x,
847 const int step_x,
848 const int intermediate_height,
849 int16_t* intermediate) {
850 // Account for the 0-taps that precede the 2 nonzero taps.
851 const int kernel_offset = 3;
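// In the 8-tap layout of the bilinear filter only taps 3 and 4 are nonzero, so
// the source window starts |kernel_offset| == 3 samples in.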
852 const int ref_x = subpixel_x >> kScaleSubPixelBits;
853 const int step_x8 = step_x << 3;
854 const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
855 const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
856 const uint16x8_t index_steps = vmulq_n_u16(
857 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
858 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
859
860 int p = subpixel_x;
861 if (width <= 4) {
862 const uint8_t* src_x =
863 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
864 // Only add steps to the 10-bit truncated p to avoid overflow.
865 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
866 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
867 const uint8x8_t filter_indices =
868 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
869 // This is a special case. The 2-tap filter has no negative taps, so we
870 // can use unsigned values.
871 // For each x, a lane of tapsK has
872 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
873 // on x.
874 const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
875 VQTbl1U8(filter_taps1, filter_indices)};
876 int y = 0;
877 do {
878 // Load a pool of samples to select from using stepped indices.
879 const uint8x16_t src_vals = vld1q_u8(src_x);
880 const uint8x8_t src_indices =
881 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
882
883 // For each x, a lane of srcK contains src_x[k].
884 const uint8x8_t src[2] = {
885 VQTbl1U8(src_vals, src_indices),
886 VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
887
888 vst1q_s16(intermediate,
889 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
890 kInterRoundBitsHorizontal - 1));
891 src_x += src_stride;
892 intermediate += kIntermediateStride;
893 } while (++y < intermediate_height);
894 return;
895 }
896
897 // |width| >= 8
898 int x = 0;
899 do {
900 const uint8_t* src_x =
901 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
902 int16_t* intermediate_x = intermediate + x;
903 // Only add steps to the 10-bit truncated p to avoid overflow.
904 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
905 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
906 const uint8x8_t filter_indices =
907 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
908 filter_index_mask);
909 // This is a special case. The 2-tap filter has no negative taps, so we
910 // can use unsigned values.
911 // For each x, a lane of tapsK has
912 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
913 // on x.
914 const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
915 VQTbl1U8(filter_taps1, filter_indices)};
916 int y = 0;
917 do {
918 // Load a pool of samples to select from using stepped indices.
919 const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
920 const uint8x8_t src_indices =
921 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
922
923 // For each x, a lane of srcK contains src_x[k].
924 const uint8x8_t src[2] = {
925 vtbl3_u8(src_vals, src_indices),
926 vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
927
928 vst1q_s16(intermediate_x,
929 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
930 kInterRoundBitsHorizontal - 1));
931 src_x += src_stride;
932 intermediate_x += kIntermediateStride;
933 } while (++y < intermediate_height);
934 x += 8;
935 p += step_x8;
936 } while (x < width);
937 }
938
939 // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
940 inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
941 assert(tap_index < 4);
942 alignas(
943 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
944 {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
945 {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
946 {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
947 {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
948
949 return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
950 }
951
952 // This filter is only possible when width <= 4.
953 void ConvolveKernelHorizontalPositive4Tap(
954 const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
955 const int step_x, const int intermediate_height, int16_t* intermediate) {
956 const int kernel_offset = 2;
957 const int ref_x = subpixel_x >> kScaleSubPixelBits;
958 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
959 const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
960 const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
961 const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
962 const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
963 const uint16x8_t index_steps = vmulq_n_u16(
964 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
965 const int p = subpixel_x;
966 // First filter is special, just a 128 tap on the center.
967 const uint8_t* src_x =
968 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
969 // Only add steps to the 10-bit truncated p to avoid overflow.
970 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
971 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
972 const uint8x8_t filter_indices = vand_u8(
973 vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
974 // Note that filter_id depends on x.
975 // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
976 const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
977 VQTbl1U8(filter_taps1, filter_indices),
978 VQTbl1U8(filter_taps2, filter_indices),
979 VQTbl1U8(filter_taps3, filter_indices)};
980
981 const uint8x8_t src_indices =
982 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
983 int y = 0;
984 do {
985 // Load a pool of samples to select from using stepped index vectors.
986 const uint8x16_t src_vals = vld1q_u8(src_x);
987
988 // For each x, srcK contains src_x[k] for k = 0, 1, 2, 3.
989 // Whereas taps come from different arrays, src pixels are drawn from the
990 // same contiguous line.
991 const uint8x8_t src[4] = {
992 VQTbl1U8(src_vals, src_indices),
993 VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
994 VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
995 VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
996
997 vst1q_s16(intermediate,
998 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
999 kInterRoundBitsHorizontal - 1));
1000
1001 src_x += src_stride;
1002 intermediate += kIntermediateStride;
1003 } while (++y < intermediate_height);
1004 }
1005
1006 // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
1007 inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
1008 assert(tap_index < 4);
1009 alignas(16) static constexpr uint8_t
1010 kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
1011 {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
1012 {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
1013 {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
1014 {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
1015
1016 return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
1017 }
1018
1019 // This filter is only possible when width <= 4.
1020 inline void ConvolveKernelHorizontalSigned4Tap(
1021 const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
1022 const int step_x, const int intermediate_height, int16_t* intermediate) {
1023 const int kernel_offset = 2;
1024 const int ref_x = subpixel_x >> kScaleSubPixelBits;
1025 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1026 const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
1027 const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
1028 const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
1029 const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
1030 const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
1031 static_cast<uint16_t>(step_x));
1032
1033 const int p = subpixel_x;
1034 const uint8_t* src_x =
1035 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1036 // Only add steps to the 10-bit truncated p to avoid overflow.
1037 const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
1038 const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
1039 const uint8x8_t filter_index_offsets = vshrn_n_u16(
1040 vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
1041 const uint8x8_t filter_indices =
1042 vand_u8(filter_index_offsets, filter_index_mask);
1043 // Note that filter_id depends on x.
1044 // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
1045 const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
1046 VQTbl1U8(filter_taps1, filter_indices),
1047 VQTbl1U8(filter_taps2, filter_indices),
1048 VQTbl1U8(filter_taps3, filter_indices)};
1049
1050 const uint8x8_t src_indices_base =
1051 vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
1052
1053 const uint8x8_t src_indices[4] = {src_indices_base,
1054 vadd_u8(src_indices_base, vdup_n_u8(1)),
1055 vadd_u8(src_indices_base, vdup_n_u8(2)),
1056 vadd_u8(src_indices_base, vdup_n_u8(3))};
1057
1058 int y = 0;
1059 do {
1060 // Load a pool of samples to select from using stepped indices.
1061 const uint8x16_t src_vals = vld1q_u8(src_x);
1062
1063 // For each x, srcK contains src_x[k] for k = 0, 1, 2, 3.
1064 // Whereas taps come from different arrays, src pixels are drawn from the
1065 // same contiguous line.
1066 const uint8x8_t src[4] = {
1067 VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
1068 VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
1069
1070 vst1q_s16(intermediate,
1071 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
1072 kInterRoundBitsHorizontal - 1));
1073 src_x += src_stride;
1074 intermediate += kIntermediateStride;
1075 } while (++y < intermediate_height);
1076 }
1077
1078 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
1079 inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
1080 assert(tap_index < 6);
1081 alignas(16) static constexpr uint8_t
1082 kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
1083 {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
1084 {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
1085 {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
1086 {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
1087 {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
1088 {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
1089
1090 return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
1091 }
1092
1093 // This filter is only possible when width >= 8.
1094 template <int grade_x>
1095 inline void ConvolveKernelHorizontalSigned6Tap(
1096 const uint8_t* src, const ptrdiff_t src_stride, const int width,
1097 const int subpixel_x, const int step_x, const int intermediate_height,
1098 int16_t* intermediate) {
1099 const int kernel_offset = 1;
1100 const uint8x8_t one = vdup_n_u8(1);
1101 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1102 const int ref_x = subpixel_x >> kScaleSubPixelBits;
1103 const int step_x8 = step_x << 3;
1104 uint8x16_t filter_taps[6];
1105 for (int i = 0; i < 6; ++i) {
1106 filter_taps[i] = GetSigned6TapFilter(i);
1107 }
1108 const uint16x8_t index_steps = vmulq_n_u16(
1109 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1110
1111 int x = 0;
1112 int p = subpixel_x;
1113 do {
1114 // Avoid overreading outside the reference boundaries. This means
1115 // |trailing_width| can be up to 24.
1116 const uint8_t* src_x =
1117 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1118 int16_t* intermediate_x = intermediate + x;
1119 // Only add steps to the 10-bit truncated p to avoid overflow.
1120 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1121 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1122 const uint8x8_t src_indices =
1123 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1124 uint8x8_t src_lookup[6];
1125 src_lookup[0] = src_indices;
1126 for (int i = 1; i < 6; ++i) {
1127 src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1128 }
1129
1130 const uint8x8_t filter_indices =
1131 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1132 filter_index_mask);
1133 // For each x, a lane of taps[k] has
1134 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1135 // on x.
1136 uint8x8_t taps[6];
1137 for (int i = 0; i < 6; ++i) {
1138 taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
1139 }
1140 int y = 0;
1141 do {
1142 // Load a pool of samples to select from using stepped indices.
1143 const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1144
1145 const uint8x8_t src[6] = {
1146 vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
1147 vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
1148 vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
1149
1150 vst1q_s16(intermediate_x,
1151 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
1152 kInterRoundBitsHorizontal - 1));
1153 src_x += src_stride;
1154 intermediate_x += kIntermediateStride;
1155 } while (++y < intermediate_height);
1156 x += 8;
1157 p += step_x8;
1158 } while (x < width);
1159 }
1160
1161 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
1162 // has mixed positive and negative outer taps which are handled in
1163 // GetMixed6TapFilter().
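// The four inner taps are always positive and go through the unsigned
// vmull_u8/vmlal_u8 path; the two outer taps, which can be -1, 0 or +1
// depending on filter_id, are kept in a signed table and applied with a
// widened signed multiply in ConvolveKernelHorizontalMixed6Tap().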
1164 inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
1165 assert(tap_index < 6);
1166 alignas(16) static constexpr uint8_t
1167 kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
1168 {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
1169 {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
1170 {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
1171 {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
1172
1173 return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
1174 }
1175
1176 inline int8x16_t GetMixed6TapFilter(const int tap_index) {
1177 assert(tap_index < 2);
1178 alignas(
1179 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
1180 {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
1181 {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
1182
1183 return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
1184 }
1185
1186 // This filter is only possible when width >= 8.
1187 template <int grade_x>
1188 inline void ConvolveKernelHorizontalMixed6Tap(
1189 const uint8_t* src, const ptrdiff_t src_stride, const int width,
1190 const int subpixel_x, const int step_x, const int intermediate_height,
1191 int16_t* intermediate) {
1192 const int kernel_offset = 1;
1193 const uint8x8_t one = vdup_n_u8(1);
1194 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1195 const int ref_x = subpixel_x >> kScaleSubPixelBits;
1196 const int step_x8 = step_x << 3;
1197 uint8x8_t taps[4];
1198 int16x8_t mixed_taps[2];
1199 uint8x16_t positive_filter_taps[4];
1200 for (int i = 0; i < 4; ++i) {
1201 positive_filter_taps[i] = GetPositive6TapFilter(i);
1202 }
1203 int8x16_t mixed_filter_taps[2];
1204 mixed_filter_taps[0] = GetMixed6TapFilter(0);
1205 mixed_filter_taps[1] = GetMixed6TapFilter(1);
1206 const uint16x8_t index_steps = vmulq_n_u16(
1207 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1208
1209 int x = 0;
1210 int p = subpixel_x;
1211 do {
1212 const uint8_t* src_x =
1213 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1214 int16_t* intermediate_x = intermediate + x;
1215 // Only add steps to the 10-bit truncated p to avoid overflow.
1216 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1217 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1218 const uint8x8_t src_indices =
1219 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1220 uint8x8_t src_lookup[6];
1221 src_lookup[0] = src_indices;
1222 for (int i = 1; i < 6; ++i) {
1223 src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1224 }
1225
1226 const uint8x8_t filter_indices =
1227 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1228 filter_index_mask);
1229 // For each x, a lane of taps[k] has
1230 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1231 // on x.
1232 for (int i = 0; i < 4; ++i) {
1233 taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
1234 }
1235 mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
1236 mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
1237
1238 int y = 0;
1239 do {
1240 // Load a pool of samples to select from using stepped indices.
1241 const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1242
1243 int16x8_t sum_mixed = vmulq_s16(
1244 mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
1245 sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
1246 ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
1247 uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
1248 sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
1249 sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
1250 sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
1251 sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
1252
1253 vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
1254 kInterRoundBitsHorizontal - 1));
1255 src_x += src_stride;
1256 intermediate_x += kIntermediateStride;
1257 } while (++y < intermediate_height);
1258 x += 8;
1259 p += step_x8;
1260 } while (x < width);
1261 }
1262
1263 // Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
1264 inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
1265 assert(tap_index < 8);
1266 alignas(16) static constexpr uint8_t
1267 kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
1268 {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
1269 {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
1270 {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
1271 {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
1272 {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
1273 {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
1274 {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
1275 {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
1276
1277 return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
1278 }
1279
1280 // This filter is only possible when width >= 8.
1281 template <int grade_x>
1282 inline void ConvolveKernelHorizontalSigned8Tap(
1283 const uint8_t* src, const ptrdiff_t src_stride, const int width,
1284 const int subpixel_x, const int step_x, const int intermediate_height,
1285 int16_t* intermediate) {
1286 const uint8x8_t one = vdup_n_u8(1);
1287 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1288 const int ref_x = subpixel_x >> kScaleSubPixelBits;
1289 const int step_x8 = step_x << 3;
1290 uint8x8_t taps[8];
1291 uint8x16_t filter_taps[8];
1292 for (int i = 0; i < 8; ++i) {
1293 filter_taps[i] = GetSigned8TapFilter(i);
1294 }
1295 const uint16x8_t index_steps = vmulq_n_u16(
1296 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1297 int x = 0;
1298 int p = subpixel_x;
1299 do {
1300 const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
1301 int16_t* intermediate_x = intermediate + x;
1302 // Only add steps to the 10-bit truncated p to avoid overflow.
1303 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1304 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1305 const uint8x8_t src_indices =
1306 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1307 uint8x8_t src_lookup[8];
1308 src_lookup[0] = src_indices;
1309 for (int i = 1; i < 8; ++i) {
1310 src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1311 }
1312
1313 const uint8x8_t filter_indices =
1314 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1315 filter_index_mask);
1316 // For each x, a lane of taps[k] has
1317 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1318 // on x.
1319 for (int i = 0; i < 8; ++i) {
1320 taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
1321 }
1322
1323 int y = 0;
1324 do {
1325 // Load a pool of samples to select from using stepped indices.
1326 const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1327
1328 const uint8x8_t src[8] = {
1329 vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
1330 vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
1331 vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
1332 vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
1333
1334 vst1q_s16(intermediate_x,
1335 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
1336 kInterRoundBitsHorizontal - 1));
1337 src_x += src_stride;
1338 intermediate_x += kIntermediateStride;
1339 } while (++y < intermediate_height);
1340 x += 8;
1341 p += step_x8;
1342 } while (x < width);
1343 }
1344
1345 // This function handles blocks of width 2 or 4.
1346 template <int num_taps, int grade_y, int width, bool is_compound>
1347 void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
1348 const int filter_index, const int step_y,
1349 const int height, void* dest,
1350 const ptrdiff_t dest_stride) {
1351 constexpr ptrdiff_t src_stride = kIntermediateStride;
1352 const int16_t* src_y = src;
1353 // |dest| is 16-bit in compound mode, Pixel otherwise.
1354 uint16_t* dest16_y = static_cast<uint16_t*>(dest);
1355 uint8_t* dest_y = static_cast<uint8_t*>(dest);
1356 int16x4_t s[num_taps + grade_y];
1357
1358 int p = subpixel_y & 1023;
1359 int prev_p = p;
1360 int y = 0;
1361 do { // y < height
1362 for (int i = 0; i < num_taps; ++i) {
1363 s[i] = vld1_s16(src_y + i * src_stride);
1364 }
1365 int filter_id = (p >> 6) & kSubPixelMask;
1366 int16x8_t filter =
1367 vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1368 int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
1369 if (is_compound) {
1370 assert(width != 2);
1371 const uint16x4_t result = vreinterpret_u16_s16(sums);
1372 vst1_u16(dest16_y, result);
1373 } else {
1374 const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1375 if (width == 2) {
1376 Store2<0>(dest_y, result);
1377 } else {
1378 StoreLo4(dest_y, result);
1379 }
1380 }
1381 p += step_y;
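// |p_diff| is the number of whole source rows the subpel position advanced.
// The second output row of this iteration reuses the already-loaded window,
// starting at s[p_diff], instead of reloading from memory.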
1382 const int p_diff =
1383 (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1384 prev_p = p;
1385 // Here we load extra source in case it is needed. If |p_diff| == 0, these
1386 // values will be unused, but it's faster to load than to branch.
1387 s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
1388 if (grade_y > 1) {
1389 s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
1390 }
1391 dest16_y += dest_stride;
1392 dest_y += dest_stride;
1393
1394 filter_id = (p >> 6) & kSubPixelMask;
1395 filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1396 sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
1397 if (is_compound) {
1398 assert(width != 2);
1399 const uint16x4_t result = vreinterpret_u16_s16(sums);
1400 vst1_u16(dest16_y, result);
1401 } else {
1402 const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1403 if (width == 2) {
1404 Store2<0>(dest_y, result);
1405 } else {
1406 StoreLo4(dest_y, result);
1407 }
1408 }
1409 p += step_y;
1410 src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1411 prev_p = p;
1412 dest16_y += dest_stride;
1413 dest_y += dest_stride;
1414
1415 y += 2;
1416 } while (y < height);
1417 }
1418
1419 template <int num_taps, int grade_y, bool is_compound>
1420 inline void ConvolveVerticalScale(const int16_t* src, const int width,
1421 const int subpixel_y, const int filter_index,
1422 const int step_y, const int height,
1423 void* dest, const ptrdiff_t dest_stride) {
1424 constexpr ptrdiff_t src_stride = kIntermediateStride;
1425 // A possible improvement is to use arithmetic to decide how many times to
1426 // apply filters to the same source rows before checking whether to load new
1427 // rows. However, this would only improve performance with very small step sizes.
1428 int16x8_t s[num_taps + grade_y];
1429 // |dest| is 16-bit in compound mode, Pixel otherwise.
1430 uint16_t* dest16_y;
1431 uint8_t* dest_y;
1432
1433 int x = 0;
1434 do { // x < width
1435 const int16_t* src_x = src + x;
1436 const int16_t* src_y = src_x;
1437 dest16_y = static_cast<uint16_t*>(dest) + x;
1438 dest_y = static_cast<uint8_t*>(dest) + x;
1439 int p = subpixel_y & 1023;
1440 int prev_p = p;
1441 int y = 0;
1442 do { // y < height
1443 for (int i = 0; i < num_taps; ++i) {
1444 s[i] = vld1q_s16(src_y + i * src_stride);
1445 }
1446 int filter_id = (p >> 6) & kSubPixelMask;
1447 int16x8_t filter =
1448 vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1449 int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
1450 if (is_compound) {
1451 vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1452 } else {
1453 vst1_u8(dest_y, vqmovun_s16(sum));
1454 }
1455 p += step_y;
1456 const int p_diff =
1457 (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1458 // When |grade_y| > 1, |p_diff| may exceed 1, so load the extra vectors that
1459 // may be needed. Otherwise only one extra vector needs to be loaded because
1460 // |p_diff| can't exceed 1.
1461 s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
1462 if (grade_y > 1) {
1463 s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
1464 }
1465 dest16_y += dest_stride;
1466 dest_y += dest_stride;
1467
1468 filter_id = (p >> 6) & kSubPixelMask;
1469 filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1470 sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
1471 if (is_compound) {
1472 vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1473 } else {
1474 vst1_u8(dest_y, vqmovun_s16(sum));
1475 }
1476 p += step_y;
1477 src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
1478 prev_p = p;
1479 dest16_y += dest_stride;
1480 dest_y += dest_stride;
1481
1482 y += 2;
1483 } while (y < height);
1484 x += 8;
1485 } while (x < width);
1486 }
1487
1488 template <bool is_compound>
1489 void ConvolveScale2D_NEON(const void* const reference,
1490 const ptrdiff_t reference_stride,
1491 const int horizontal_filter_index,
1492 const int vertical_filter_index, const int subpixel_x,
1493 const int subpixel_y, const int step_x,
1494 const int step_y, const int width, const int height,
1495 void* prediction, const ptrdiff_t pred_stride) {
1496 const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1497 const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1498 assert(step_x <= 2048);
1499 const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
1500 const int intermediate_height =
1501 (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
1502 kScaleSubPixelBits) +
1503 num_vert_taps;
1504 assert(step_x <= 2048);
1505 // The output of the horizontal filter, i.e. the intermediate_result, is
1506 // guaranteed to fit in int16_t.
1507 int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
1508 (2 * kMaxSuperBlockSizeInPixels + 8)];
1509
1510 // Horizontal filter.
1511 // Filter types used for width <= 4 are different from those for width > 4.
1512 // When width > 4, the valid filter index range is always [0, 3].
1513 // When width <= 4, the valid filter index range is always [3, 5].
1514 // Similarly for height.
1515 int filter_index = GetFilterIndex(horizontal_filter_index, width);
1516 int16_t* intermediate = intermediate_result;
1517 const ptrdiff_t src_stride = reference_stride;
1518 const auto* src = static_cast<const uint8_t*>(reference);
1519 const int vert_kernel_offset = (8 - num_vert_taps) / 2;
1520 src += vert_kernel_offset * src_stride;
1521
1522 // Derive the maximum value of |step_x| at which all source values fit in one
1523 // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
1524 // step_x*7 is the final base subpel index for the shuffle mask for filter
1525 // inputs in each iteration on large blocks. When step_x is large, we need a
1526 // larger structure and use a larger table lookup in order to gather all
1527 // filter inputs.
1528 // |num_taps| - 1 is the shuffle index of the final filter input.
1529 const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
1530 const int kernel_start_ceiling = 16 - num_horiz_taps;
1531 // This truncated quotient |grade_x_threshold| selects |step_x| such that:
1532 // (step_x * 7) >> kScaleSubPixelBits < single load limit
1533 const int grade_x_threshold =
1534 (kernel_start_ceiling << kScaleSubPixelBits) / 7;
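// When |step_x| exceeds |grade_x_threshold|, the <2> variants are chosen; they
// gather filter inputs from a wider source load because eight output columns
// may then span more than 16 source pixels.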
1535 switch (filter_index) {
1536 case 0:
1537 if (step_x > grade_x_threshold) {
1538 ConvolveKernelHorizontalSigned6Tap<2>(
1539 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1540 intermediate);
1541 } else {
1542 ConvolveKernelHorizontalSigned6Tap<1>(
1543 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1544 intermediate);
1545 }
1546 break;
1547 case 1:
1548 if (step_x > grade_x_threshold) {
1549 ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
1550 step_x, intermediate_height,
1551 intermediate);
1552
1553 } else {
1554 ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
1555 step_x, intermediate_height,
1556 intermediate);
1557 }
1558 break;
1559 case 2:
1560 if (step_x > grade_x_threshold) {
1561 ConvolveKernelHorizontalSigned8Tap<2>(
1562 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1563 intermediate);
1564 } else {
1565 ConvolveKernelHorizontalSigned8Tap<1>(
1566 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1567 intermediate);
1568 }
1569 break;
1570 case 3:
1571 if (step_x > grade_x_threshold) {
1572 ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
1573 step_x, intermediate_height,
1574 intermediate);
1575 } else {
1576 ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
1577 step_x, intermediate_height,
1578 intermediate);
1579 }
1580 break;
1581 case 4:
1582 assert(width <= 4);
1583 ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
1584 intermediate_height, intermediate);
1585 break;
1586 default:
1587 assert(filter_index == 5);
1588 ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
1589 intermediate_height, intermediate);
1590 }
1591 // Vertical filter.
1592 filter_index = GetFilterIndex(vertical_filter_index, height);
1593 intermediate = intermediate_result;
1594
1595 switch (filter_index) {
1596 case 0:
1597 case 1:
1598 if (step_y <= 1024) {
1599 if (!is_compound && width == 2) {
1600 ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
1601 intermediate, subpixel_y, filter_index, step_y, height,
1602 prediction, pred_stride);
1603 } else if (width == 4) {
1604 ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
1605 intermediate, subpixel_y, filter_index, step_y, height,
1606 prediction, pred_stride);
1607 } else {
1608 ConvolveVerticalScale<6, 1, is_compound>(
1609 intermediate, width, subpixel_y, filter_index, step_y, height,
1610 prediction, pred_stride);
1611 }
1612 } else {
1613 if (!is_compound && width == 2) {
1614 ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
1615 intermediate, subpixel_y, filter_index, step_y, height,
1616 prediction, pred_stride);
1617 } else if (width == 4) {
1618 ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
1619 intermediate, subpixel_y, filter_index, step_y, height,
1620 prediction, pred_stride);
1621 } else {
1622 ConvolveVerticalScale<6, 2, is_compound>(
1623 intermediate, width, subpixel_y, filter_index, step_y, height,
1624 prediction, pred_stride);
1625 }
1626 }
1627 break;
1628 case 2:
1629 if (step_y <= 1024) {
1630 if (!is_compound && width == 2) {
1631 ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
1632 intermediate, subpixel_y, filter_index, step_y, height,
1633 prediction, pred_stride);
1634 } else if (width == 4) {
1635 ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
1636 intermediate, subpixel_y, filter_index, step_y, height,
1637 prediction, pred_stride);
1638 } else {
1639 ConvolveVerticalScale<8, 1, is_compound>(
1640 intermediate, width, subpixel_y, filter_index, step_y, height,
1641 prediction, pred_stride);
1642 }
1643 } else {
1644 if (!is_compound && width == 2) {
1645 ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
1646 intermediate, subpixel_y, filter_index, step_y, height,
1647 prediction, pred_stride);
1648 } else if (width == 4) {
1649 ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
1650 intermediate, subpixel_y, filter_index, step_y, height,
1651 prediction, pred_stride);
1652 } else {
1653 ConvolveVerticalScale<8, 2, is_compound>(
1654 intermediate, width, subpixel_y, filter_index, step_y, height,
1655 prediction, pred_stride);
1656 }
1657 }
1658 break;
1659 case 3:
1660 if (step_y <= 1024) {
1661 if (!is_compound && width == 2) {
1662 ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
1663 intermediate, subpixel_y, filter_index, step_y, height,
1664 prediction, pred_stride);
1665 } else if (width == 4) {
1666 ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
1667 intermediate, subpixel_y, filter_index, step_y, height,
1668 prediction, pred_stride);
1669 } else {
1670 ConvolveVerticalScale<2, 1, is_compound>(
1671 intermediate, width, subpixel_y, filter_index, step_y, height,
1672 prediction, pred_stride);
1673 }
1674 } else {
1675 if (!is_compound && width == 2) {
1676 ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
1677 intermediate, subpixel_y, filter_index, step_y, height,
1678 prediction, pred_stride);
1679 } else if (width == 4) {
1680 ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
1681 intermediate, subpixel_y, filter_index, step_y, height,
1682 prediction, pred_stride);
1683 } else {
1684 ConvolveVerticalScale<2, 2, is_compound>(
1685 intermediate, width, subpixel_y, filter_index, step_y, height,
1686 prediction, pred_stride);
1687 }
1688 }
1689 break;
1690 case 4:
1691 default:
1692 assert(filter_index == 4 || filter_index == 5);
1693 assert(height <= 4);
1694 if (step_y <= 1024) {
1695 if (!is_compound && width == 2) {
1696 ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
1697 intermediate, subpixel_y, filter_index, step_y, height,
1698 prediction, pred_stride);
1699 } else if (width == 4) {
1700 ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
1701 intermediate, subpixel_y, filter_index, step_y, height,
1702 prediction, pred_stride);
1703 } else {
1704 ConvolveVerticalScale<4, 1, is_compound>(
1705 intermediate, width, subpixel_y, filter_index, step_y, height,
1706 prediction, pred_stride);
1707 }
1708 } else {
1709 if (!is_compound && width == 2) {
1710 ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
1711 intermediate, subpixel_y, filter_index, step_y, height,
1712 prediction, pred_stride);
1713 } else if (width == 4) {
1714 ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
1715 intermediate, subpixel_y, filter_index, step_y, height,
1716 prediction, pred_stride);
1717 } else {
1718 ConvolveVerticalScale<4, 2, is_compound>(
1719 intermediate, width, subpixel_y, filter_index, step_y, height,
1720 prediction, pred_stride);
1721 }
1722 }
1723 }
1724 }
1725
1726 void ConvolveHorizontal_NEON(const void* const reference,
1727 const ptrdiff_t reference_stride,
1728 const int horizontal_filter_index,
1729 const int /*vertical_filter_index*/,
1730 const int subpixel_x, const int /*subpixel_y*/,
1731 const int width, const int height,
1732 void* prediction, const ptrdiff_t pred_stride) {
1733 const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1734 // Set |src| to the outermost tap.
1735 const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1736 auto* dest = static_cast<uint8_t*>(prediction);
1737
1738 DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
1739 subpixel_x, filter_index);
1740 }
1741
1742 // The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
1743 // Vertical calculations.
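// The taps are stored pre-shifted by 1, so the rounding shift applied here is
// kInterRoundBitsHorizontal - 1 rather than kInterRoundBitsHorizontal.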
1744 uint16x8_t Compound1DShift(const int16x8_t sum) {
1745 return vreinterpretq_u16_s16(
1746 vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
1747 }
1748
1749 template <int filter_index, bool is_compound = false,
1750 bool negative_outside_taps = false>
1751 void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
1752 void* const dst, const ptrdiff_t dst_stride,
1753 const int width, const int height,
1754 const uint8x8_t* const taps) {
1755 const int num_taps = GetNumTapsInFilter(filter_index);
1756 const int next_row = num_taps - 1;
1757 auto* dst8 = static_cast<uint8_t*>(dst);
1758 auto* dst16 = static_cast<uint16_t*>(dst);
1759 assert(width >= 8);
1760
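// |srcs| is a sliding window of source rows: the first num_taps - 1 rows are
// preloaded below, then each y iteration loads one new row into srcs[next_row]
// and shifts the window down by one.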
1761 int x = 0;
1762 do {
1763 const uint8_t* src_x = src + x;
1764 uint8x8_t srcs[8];
1765 srcs[0] = vld1_u8(src_x);
1766 src_x += src_stride;
1767 if (num_taps >= 4) {
1768 srcs[1] = vld1_u8(src_x);
1769 src_x += src_stride;
1770 srcs[2] = vld1_u8(src_x);
1771 src_x += src_stride;
1772 if (num_taps >= 6) {
1773 srcs[3] = vld1_u8(src_x);
1774 src_x += src_stride;
1775 srcs[4] = vld1_u8(src_x);
1776 src_x += src_stride;
1777 if (num_taps == 8) {
1778 srcs[5] = vld1_u8(src_x);
1779 src_x += src_stride;
1780 srcs[6] = vld1_u8(src_x);
1781 src_x += src_stride;
1782 }
1783 }
1784 }
1785
1786 int y = 0;
1787 do {
1788 srcs[next_row] = vld1_u8(src_x);
1789 src_x += src_stride;
1790
1791 const int16x8_t sums =
1792 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1793 if (is_compound) {
1794 const uint16x8_t results = Compound1DShift(sums);
1795 vst1q_u16(dst16 + x + y * dst_stride, results);
1796 } else {
1797 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1798 vst1_u8(dst8 + x + y * dst_stride, results);
1799 }
1800
1801 srcs[0] = srcs[1];
1802 if (num_taps >= 4) {
1803 srcs[1] = srcs[2];
1804 srcs[2] = srcs[3];
1805 if (num_taps >= 6) {
1806 srcs[3] = srcs[4];
1807 srcs[4] = srcs[5];
1808 if (num_taps == 8) {
1809 srcs[5] = srcs[6];
1810 srcs[6] = srcs[7];
1811 }
1812 }
1813 }
1814 } while (++y < height);
1815 x += 8;
1816 } while (x < width);
1817 }
1818
1819 template <int filter_index, bool is_compound = false,
1820 bool negative_outside_taps = false>
1821 void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
1822 void* const dst, const ptrdiff_t dst_stride,
1823 const int height, const uint8x8_t* const taps) {
1824 const int num_taps = GetNumTapsInFilter(filter_index);
1825 auto* dst8 = static_cast<uint8_t*>(dst);
1826 auto* dst16 = static_cast<uint16_t*>(dst);
1827
1828 uint8x8_t srcs[9];
1829
1830 if (num_taps == 2) {
1831 srcs[2] = vdup_n_u8(0);
1832
1833 srcs[0] = Load4(src);
1834 src += src_stride;
1835
1836 int y = 0;
1837 do {
1838 srcs[0] = Load4<1>(src, srcs[0]);
1839 src += src_stride;
1840 srcs[2] = Load4<0>(src, srcs[2]);
1841 src += src_stride;
1842 srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1843
1844 const int16x8_t sums =
1845 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1846 if (is_compound) {
1847 const uint16x8_t results = Compound1DShift(sums);
1848
1849 vst1q_u16(dst16, results);
1850 dst16 += 4 << 1;
1851 } else {
1852 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1853
1854 StoreLo4(dst8, results);
1855 dst8 += dst_stride;
1856 StoreHi4(dst8, results);
1857 dst8 += dst_stride;
1858 }
1859
1860 srcs[0] = srcs[2];
1861 y += 2;
1862 } while (y < height);
1863 } else if (num_taps == 4) {
1864 srcs[4] = vdup_n_u8(0);
1865
1866 srcs[0] = Load4(src);
1867 src += src_stride;
1868 srcs[0] = Load4<1>(src, srcs[0]);
1869 src += src_stride;
1870 srcs[2] = Load4(src);
1871 src += src_stride;
1872 srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1873
1874 int y = 0;
1875 do {
1876 srcs[2] = Load4<1>(src, srcs[2]);
1877 src += src_stride;
1878 srcs[4] = Load4<0>(src, srcs[4]);
1879 src += src_stride;
1880 srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1881
1882 const int16x8_t sums =
1883 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1884 if (is_compound) {
1885 const uint16x8_t results = Compound1DShift(sums);
1886
1887 vst1q_u16(dst16, results);
1888 dst16 += 4 << 1;
1889 } else {
1890 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1891
1892 StoreLo4(dst8, results);
1893 dst8 += dst_stride;
1894 StoreHi4(dst8, results);
1895 dst8 += dst_stride;
1896 }
1897
1898 srcs[0] = srcs[2];
1899 srcs[1] = srcs[3];
1900 srcs[2] = srcs[4];
1901 y += 2;
1902 } while (y < height);
1903 } else if (num_taps == 6) {
1904 srcs[6] = vdup_n_u8(0);
1905
1906 srcs[0] = Load4(src);
1907 src += src_stride;
1908 srcs[0] = Load4<1>(src, srcs[0]);
1909 src += src_stride;
1910 srcs[2] = Load4(src);
1911 src += src_stride;
1912 srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1913 srcs[2] = Load4<1>(src, srcs[2]);
1914 src += src_stride;
1915 srcs[4] = Load4(src);
1916 src += src_stride;
1917 srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1918
1919 int y = 0;
1920 do {
1921 srcs[4] = Load4<1>(src, srcs[4]);
1922 src += src_stride;
1923 srcs[6] = Load4<0>(src, srcs[6]);
1924 src += src_stride;
1925 srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1926
1927 const int16x8_t sums =
1928 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1929 if (is_compound) {
1930 const uint16x8_t results = Compound1DShift(sums);
1931
1932 vst1q_u16(dst16, results);
1933 dst16 += 4 << 1;
1934 } else {
1935 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1936
1937 StoreLo4(dst8, results);
1938 dst8 += dst_stride;
1939 StoreHi4(dst8, results);
1940 dst8 += dst_stride;
1941 }
1942
1943 srcs[0] = srcs[2];
1944 srcs[1] = srcs[3];
1945 srcs[2] = srcs[4];
1946 srcs[3] = srcs[5];
1947 srcs[4] = srcs[6];
1948 y += 2;
1949 } while (y < height);
1950 } else if (num_taps == 8) {
1951 srcs[8] = vdup_n_u8(0);
1952
1953 srcs[0] = Load4(src);
1954 src += src_stride;
1955 srcs[0] = Load4<1>(src, srcs[0]);
1956 src += src_stride;
1957 srcs[2] = Load4(src);
1958 src += src_stride;
1959 srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1960 srcs[2] = Load4<1>(src, srcs[2]);
1961 src += src_stride;
1962 srcs[4] = Load4(src);
1963 src += src_stride;
1964 srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1965 srcs[4] = Load4<1>(src, srcs[4]);
1966 src += src_stride;
1967 srcs[6] = Load4(src);
1968 src += src_stride;
1969 srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1970
1971 int y = 0;
1972 do {
1973 srcs[6] = Load4<1>(src, srcs[6]);
1974 src += src_stride;
1975 srcs[8] = Load4<0>(src, srcs[8]);
1976 src += src_stride;
1977 srcs[7] = vext_u8(srcs[6], srcs[8], 4);
1978
1979 const int16x8_t sums =
1980 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1981 if (is_compound) {
1982 const uint16x8_t results = Compound1DShift(sums);
1983
1984 vst1q_u16(dst16, results);
1985 dst16 += 4 << 1;
1986 } else {
1987 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1988
1989 StoreLo4(dst8, results);
1990 dst8 += dst_stride;
1991 StoreHi4(dst8, results);
1992 dst8 += dst_stride;
1993 }
1994
1995 srcs[0] = srcs[2];
1996 srcs[1] = srcs[3];
1997 srcs[2] = srcs[4];
1998 srcs[3] = srcs[5];
1999 srcs[4] = srcs[6];
2000 srcs[5] = srcs[7];
2001 srcs[6] = srcs[8];
2002 y += 2;
2003 } while (y < height);
2004 }
2005 }
2006
2007 template <int filter_index, bool negative_outside_taps = false>
2008 void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
2009 void* const dst, const ptrdiff_t dst_stride,
2010 const int height, const uint8x8_t* const taps) {
2011 const int num_taps = GetNumTapsInFilter(filter_index);
2012 auto* dst8 = static_cast<uint8_t*>(dst);
2013
2014 uint8x8_t srcs[9];
2015
2016 if (num_taps == 2) {
2017 srcs[2] = vdup_n_u8(0);
2018
2019 srcs[0] = Load2(src);
2020 src += src_stride;
2021
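// Each 8-byte register packs four 2-pixel rows, so a single filter pass below
// produces four output rows at once.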
2022 int y = 0;
2023 do {
2024 srcs[0] = Load2<1>(src, srcs[0]);
2025 src += src_stride;
2026 srcs[0] = Load2<2>(src, srcs[0]);
2027 src += src_stride;
2028 srcs[0] = Load2<3>(src, srcs[0]);
2029 src += src_stride;
2030 srcs[2] = Load2<0>(src, srcs[2]);
2031 src += src_stride;
2032 srcs[1] = vext_u8(srcs[0], srcs[2], 2);
2033
2034 // This uses srcs[0]..srcs[1].
2035 const int16x8_t sums =
2036 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2037 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2038
2039 Store2<0>(dst8, results);
2040 dst8 += dst_stride;
2041 Store2<1>(dst8, results);
2042 if (height == 2) return;
2043 dst8 += dst_stride;
2044 Store2<2>(dst8, results);
2045 dst8 += dst_stride;
2046 Store2<3>(dst8, results);
2047 dst8 += dst_stride;
2048
2049 srcs[0] = srcs[2];
2050 y += 4;
2051 } while (y < height);
2052 } else if (num_taps == 4) {
2053 srcs[4] = vdup_n_u8(0);
2054
2055 srcs[0] = Load2(src);
2056 src += src_stride;
2057 srcs[0] = Load2<1>(src, srcs[0]);
2058 src += src_stride;
2059 srcs[0] = Load2<2>(src, srcs[0]);
2060 src += src_stride;
2061
2062 int y = 0;
2063 do {
2064 srcs[0] = Load2<3>(src, srcs[0]);
2065 src += src_stride;
2066 srcs[4] = Load2<0>(src, srcs[4]);
2067 src += src_stride;
2068 srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2069 srcs[4] = Load2<1>(src, srcs[4]);
2070 src += src_stride;
2071 srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2072 srcs[4] = Load2<2>(src, srcs[4]);
2073 src += src_stride;
2074 srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2075
2076 // This uses srcs[0]..srcs[3].
2077 const int16x8_t sums =
2078 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2079 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2080
2081 Store2<0>(dst8, results);
2082 dst8 += dst_stride;
2083 Store2<1>(dst8, results);
2084 if (height == 2) return;
2085 dst8 += dst_stride;
2086 Store2<2>(dst8, results);
2087 dst8 += dst_stride;
2088 Store2<3>(dst8, results);
2089 dst8 += dst_stride;
2090
2091 srcs[0] = srcs[4];
2092 y += 4;
2093 } while (y < height);
2094 } else if (num_taps == 6) {
2095 // During the vertical pass the number of taps is restricted when
2096 // |height| <= 4.
2097 assert(height > 4);
2098 srcs[8] = vdup_n_u8(0);
2099
2100 srcs[0] = Load2(src);
2101 src += src_stride;
2102 srcs[0] = Load2<1>(src, srcs[0]);
2103 src += src_stride;
2104 srcs[0] = Load2<2>(src, srcs[0]);
2105 src += src_stride;
2106 srcs[0] = Load2<3>(src, srcs[0]);
2107 src += src_stride;
2108 srcs[4] = Load2(src);
2109 src += src_stride;
2110 srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2111
2112 int y = 0;
2113 do {
2114 srcs[4] = Load2<1>(src, srcs[4]);
2115 src += src_stride;
2116 srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2117 srcs[4] = Load2<2>(src, srcs[4]);
2118 src += src_stride;
2119 srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2120 srcs[4] = Load2<3>(src, srcs[4]);
2121 src += src_stride;
2122 srcs[8] = Load2<0>(src, srcs[8]);
2123 src += src_stride;
2124 srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2125
2126 // This uses srcs[0]..srcs[5].
2127 const int16x8_t sums =
2128 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2129 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2130
2131 Store2<0>(dst8, results);
2132 dst8 += dst_stride;
2133 Store2<1>(dst8, results);
2134 dst8 += dst_stride;
2135 Store2<2>(dst8, results);
2136 dst8 += dst_stride;
2137 Store2<3>(dst8, results);
2138 dst8 += dst_stride;
2139
2140 srcs[0] = srcs[4];
2141 srcs[1] = srcs[5];
2142 srcs[4] = srcs[8];
2143 y += 4;
2144 } while (y < height);
2145 } else if (num_taps == 8) {
2146 // During the vertical pass the number of taps is restricted when
2147 // |height| <= 4.
2148 assert(height > 4);
2149 srcs[8] = vdup_n_u8(0);
2150
2151 srcs[0] = Load2(src);
2152 src += src_stride;
2153 srcs[0] = Load2<1>(src, srcs[0]);
2154 src += src_stride;
2155 srcs[0] = Load2<2>(src, srcs[0]);
2156 src += src_stride;
2157 srcs[0] = Load2<3>(src, srcs[0]);
2158 src += src_stride;
2159 srcs[4] = Load2(src);
2160 src += src_stride;
2161 srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2162 srcs[4] = Load2<1>(src, srcs[4]);
2163 src += src_stride;
2164 srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2165 srcs[4] = Load2<2>(src, srcs[4]);
2166 src += src_stride;
2167 srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2168
2169 int y = 0;
2170 do {
2171 srcs[4] = Load2<3>(src, srcs[4]);
2172 src += src_stride;
2173 srcs[8] = Load2<0>(src, srcs[8]);
2174 src += src_stride;
2175 srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2176 srcs[8] = Load2<1>(src, srcs[8]);
2177 src += src_stride;
2178 srcs[6] = vext_u8(srcs[4], srcs[8], 4);
2179 srcs[8] = Load2<2>(src, srcs[8]);
2180 src += src_stride;
2181 srcs[7] = vext_u8(srcs[4], srcs[8], 6);
2182
2183 // This uses srcs[0]..srcs[7].
2184 const int16x8_t sums =
2185 SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2186 const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2187
2188 Store2<0>(dst8, results);
2189 dst8 += dst_stride;
2190 Store2<1>(dst8, results);
2191 dst8 += dst_stride;
2192 Store2<2>(dst8, results);
2193 dst8 += dst_stride;
2194 Store2<3>(dst8, results);
2195 dst8 += dst_stride;
2196
2197 srcs[0] = srcs[4];
2198 srcs[1] = srcs[5];
2199 srcs[2] = srcs[6];
2200 srcs[3] = srcs[7];
2201 srcs[4] = srcs[8];
2202 y += 4;
2203 } while (y < height);
2204 }
2205 }
2206
2207 // This function is a simplified version of Convolve2D_C.
2208 // It is called in single prediction mode when only vertical filtering is
2209 // required.
2210 // The output is the single prediction of the block, clipped to valid pixel
2211 // range.
2212 void ConvolveVertical_NEON(const void* const reference,
2213 const ptrdiff_t reference_stride,
2214 const int /*horizontal_filter_index*/,
2215 const int vertical_filter_index,
2216 const int /*subpixel_x*/, const int subpixel_y,
2217 const int width, const int height, void* prediction,
2218 const ptrdiff_t pred_stride) {
2219 const int filter_index = GetFilterIndex(vertical_filter_index, height);
2220 const int vertical_taps = GetNumTapsInFilter(filter_index);
2221 const ptrdiff_t src_stride = reference_stride;
2222 const auto* src = static_cast<const uint8_t*>(reference) -
2223 (vertical_taps / 2 - 1) * src_stride;
2224 auto* dest = static_cast<uint8_t*>(prediction);
2225 const ptrdiff_t dest_stride = pred_stride;
2226 const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
2227 assert(filter_id != 0);
2228
2229 uint8x8_t taps[8];
2230 for (int k = 0; k < kSubPixelTaps; ++k) {
2231 taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
2232 }
2233
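// The filter tables store every filter in an 8-entry row with shorter filters
// centered, so the dispatch below passes taps + 1 for 6-tap filters, taps + 2
// for 4-tap filters and taps + 3 for the 2-tap filter.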
2234 if (filter_index == 0) { // 6 tap.
2235 if (width == 2) {
2236 FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
2237 taps + 1);
2238 } else if (width == 4) {
2239 FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
2240 taps + 1);
2241 } else {
2242 FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
2243 taps + 1);
2244 }
2245 } else if ((filter_index == 1) &
2246 ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
2247 if (width == 2) {
2248 FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
2249 taps + 1);
2250 } else if (width == 4) {
2251 FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
2252 taps + 1);
2253 } else {
2254 FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
2255 taps + 1);
2256 }
2257 } else if ((filter_index == 1) &
2258 ((filter_id == 7) | (filter_id == 8) |
2259 (filter_id == 9))) { // 6 tap with weird negative taps.
2260 if (width == 2) {
2261 FilterVertical2xH<1,
2262 /*negative_outside_taps=*/true>(
2263 src, src_stride, dest, dest_stride, height, taps + 1);
2264 } else if (width == 4) {
2265 FilterVertical4xH<1, /*is_compound=*/false,
2266 /*negative_outside_taps=*/true>(
2267 src, src_stride, dest, dest_stride, height, taps + 1);
2268 } else {
2269 FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
2270 src, src_stride, dest, dest_stride, width, height, taps + 1);
2271 }
2272 } else if (filter_index == 2) { // 8 tap.
2273 if (width == 2) {
2274 FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
2275 } else if (width == 4) {
2276 FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
2277 } else {
2278 FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
2279 taps);
2280 }
2281 } else if (filter_index == 3) { // 2 tap.
2282 if (width == 2) {
2283 FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
2284 taps + 3);
2285 } else if (width == 4) {
2286 FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
2287 taps + 3);
2288 } else {
2289 FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
2290 taps + 3);
2291 }
2292 } else if (filter_index == 4) { // 4 tap.
2293 // Outside taps are negative.
2294 if (width == 2) {
2295 FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
2296 taps + 2);
2297 } else if (width == 4) {
2298 FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
2299 taps + 2);
2300 } else {
2301 FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
2302 taps + 2);
2303 }
2304 } else {
2305 // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
2306 // to 4 tap filters.
2307 assert(filter_index == 5 ||
2308 (filter_index == 1 &&
2309 (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
2310 filter_id == 5 || filter_id == 6 || filter_id == 10 ||
2311 filter_id == 11 || filter_id == 12 || filter_id == 13 ||
2312 filter_id == 14)));
2313 // According to GetNumTapsInFilter() this has 6 taps but here we are
2314 // treating it as though it has 4.
2315 if (filter_index == 1) src += src_stride;
2316 if (width == 2) {
2317 FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
2318 taps + 2);
2319 } else if (width == 4) {
2320 FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
2321 taps + 2);
2322 } else {
2323 FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
2324 taps + 2);
2325 }
2326 }
2327 }
2328
2329 void ConvolveCompoundCopy_NEON(
2330 const void* const reference, const ptrdiff_t reference_stride,
2331 const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
2332 const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
2333 const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
2334 const auto* src = static_cast<const uint8_t*>(reference);
2335 const ptrdiff_t src_stride = reference_stride;
2336 auto* dest = static_cast<uint16_t*>(prediction);
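// The copy path left-shifts each pixel by |final_shift| so that its scale
// matches the intermediate precision produced by the filtering compound paths.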
2337 constexpr int final_shift =
2338 kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
2339
2340 if (width >= 16) {
2341 int y = 0;
2342 do {
2343 int x = 0;
2344 do {
2345 const uint8x16_t v_src = vld1q_u8(&src[x]);
2346 const uint16x8_t v_dest_lo =
2347 vshll_n_u8(vget_low_u8(v_src), final_shift);
2348 const uint16x8_t v_dest_hi =
2349 vshll_n_u8(vget_high_u8(v_src), final_shift);
2350 vst1q_u16(&dest[x], v_dest_lo);
2351 x += 8;
2352 vst1q_u16(&dest[x], v_dest_hi);
2353 x += 8;
2354 } while (x < width);
2355 src += src_stride;
2356 dest += width;
2357 } while (++y < height);
2358 } else if (width == 8) {
2359 int y = 0;
2360 do {
2361 const uint8x8_t v_src = vld1_u8(&src[0]);
2362 const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2363 vst1q_u16(&dest[0], v_dest);
2364 src += src_stride;
2365 dest += width;
2366 } while (++y < height);
2367 } else { /* width == 4 */
2368 uint8x8_t v_src = vdup_n_u8(0);
2369
2370 int y = 0;
2371 do {
2372 v_src = Load4<0>(&src[0], v_src);
2373 src += src_stride;
2374 v_src = Load4<1>(&src[0], v_src);
2375 src += src_stride;
2376 const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2377 vst1q_u16(&dest[0], v_dest);
2378 dest += 4 << 1;
2379 y += 2;
2380 } while (y < height);
2381 }
2382 }
2383
2384 void ConvolveCompoundVertical_NEON(
2385 const void* const reference, const ptrdiff_t reference_stride,
2386 const int /*horizontal_filter_index*/, const int vertical_filter_index,
2387 const int /*subpixel_x*/, const int subpixel_y, const int width,
2388 const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
2389 const int filter_index = GetFilterIndex(vertical_filter_index, height);
2390 const int vertical_taps = GetNumTapsInFilter(filter_index);
2391 const ptrdiff_t src_stride = reference_stride;
2392 const auto* src = static_cast<const uint8_t*>(reference) -
2393 (vertical_taps / 2 - 1) * src_stride;
2394 auto* dest = static_cast<uint16_t*>(prediction);
2395 const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
2396 assert(filter_id != 0);
2397
2398 uint8x8_t taps[8];
2399 for (int k = 0; k < kSubPixelTaps; ++k) {
2400 taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
2401 }
2402
2403 if (filter_index == 0) { // 6 tap.
2404 if (width == 4) {
2405 FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
2406 height, taps + 1);
2407 } else {
2408 FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
2409 width, height, taps + 1);
2410 }
2411 } else if ((filter_index == 1) &
2412 ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
2413 if (width == 4) {
2414 FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
2415 height, taps + 1);
2416 } else {
2417 FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
2418 width, height, taps + 1);
2419 }
2420 } else if ((filter_index == 1) &
2421 ((filter_id == 7) | (filter_id == 8) |
2422 (filter_id == 9))) { // 6 tap with weird negative taps.
2423 if (width == 4) {
2424 FilterVertical4xH<1, /*is_compound=*/true,
2425 /*negative_outside_taps=*/true>(src, src_stride, dest,
2426 4, height, taps + 1);
2427 } else {
2428 FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
2429 src, src_stride, dest, width, width, height, taps + 1);
2430 }
2431 } else if (filter_index == 2) { // 8 tap.
2432 if (width == 4) {
2433 FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
2434 height, taps);
2435 } else {
2436 FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
2437 width, height, taps);
2438 }
2439 } else if (filter_index == 3) { // 2 tap.
2440 if (width == 4) {
2441 FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
2442 height, taps + 3);
2443 } else {
2444 FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
2445 width, height, taps + 3);
2446 }
2447 } else if (filter_index == 4) { // 4 tap.
2448 if (width == 4) {
2449 FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
2450 height, taps + 2);
2451 } else {
2452 FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
2453 width, height, taps + 2);
2454 }
2455 } else {
2456 // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
2457 // to 4 tap filters.
2458 assert(filter_index == 5 ||
2459 (filter_index == 1 &&
2460 (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
2461 filter_id == 5 || filter_id == 6 || filter_id == 10 ||
2462 filter_id == 11 || filter_id == 12 || filter_id == 13 ||
2463 filter_id == 14)));
2464 // According to GetNumTapsInFilter() this has 6 taps but here we are
2465 // treating it as though it has 4.
2466 if (filter_index == 1) src += src_stride;
2467 if (width == 4) {
2468 FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
2469 height, taps + 2);
2470 } else {
2471 FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
2472 width, height, taps + 2);
2473 }
2474 }
2475 }
2476
2477 void ConvolveCompoundHorizontal_NEON(
2478 const void* const reference, const ptrdiff_t reference_stride,
2479 const int horizontal_filter_index, const int /*vertical_filter_index*/,
2480 const int subpixel_x, const int /*subpixel_y*/, const int width,
2481 const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
2482 const int filter_index = GetFilterIndex(horizontal_filter_index, width);
2483 const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
2484 auto* dest = static_cast<uint16_t*>(prediction);
2485
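// Compound prediction buffers are densely packed, so |width| is used as the
// destination stride instead of |pred_stride|.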
2486 DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
2487 src, reference_stride, dest, width, width, height, subpixel_x,
2488 filter_index);
2489 }
2490
2491 void ConvolveCompound2D_NEON(
2492 const void* const reference, const ptrdiff_t reference_stride,
2493 const int horizontal_filter_index, const int vertical_filter_index,
2494 const int subpixel_x, const int subpixel_y, const int width,
2495 const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
2496 // The output of the horizontal filter, i.e. the intermediate_result, is
2497 // guaranteed to fit in int16_t.
2498 uint16_t
2499 intermediate_result[kMaxSuperBlockSizeInPixels *
2500 (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
2501
2502 // Horizontal filter.
2503 // Filter types used for width <= 4 are different from those for width > 4.
2504 // When width > 4, the valid filter index range is always [0, 3].
2505 // When width <= 4, the valid filter index range is always [4, 5].
2506 // Similarly for height.
2507 const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
2508 const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
2509 const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
2510 const int intermediate_height = height + vertical_taps - 1;
2511 const ptrdiff_t src_stride = reference_stride;
2512 const auto* const src = static_cast<const uint8_t*>(reference) -
2513 (vertical_taps / 2 - 1) * src_stride -
2514 kHorizontalOffset;
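// |src| is backed up by (vertical_taps / 2 - 1) rows and kHorizontalOffset
// columns so that both filter kernels are centered on the block.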
2515
2516 DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
2517 src, src_stride, intermediate_result, width, width, intermediate_height,
2518 subpixel_x, horiz_filter_index);
2519
2520 // Vertical filter.
2521 auto* dest = static_cast<uint16_t*>(prediction);
2522 const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
2523 assert(filter_id != 0);
2524
2525 const ptrdiff_t dest_stride = width;
2526 const int16x8_t taps =
2527 vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
2528
2529 if (vertical_taps == 8) {
2530 if (width == 4) {
2531 Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
2532 dest_stride, height, taps);
2533 } else {
2534 Filter2DVertical<8, /*is_compound=*/true>(
2535 intermediate_result, dest, dest_stride, width, height, taps);
2536 }
2537 } else if (vertical_taps == 6) {
2538 if (width == 4) {
2539 Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
2540 dest_stride, height, taps);
2541 } else {
2542 Filter2DVertical<6, /*is_compound=*/true>(
2543 intermediate_result, dest, dest_stride, width, height, taps);
2544 }
2545 } else if (vertical_taps == 4) {
2546 if (width == 4) {
2547 Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
2548 dest_stride, height, taps);
2549 } else {
2550 Filter2DVertical<4, /*is_compound=*/true>(
2551 intermediate_result, dest, dest_stride, width, height, taps);
2552 }
2553 } else { // |vertical_taps| == 2
2554 if (width == 4) {
2555 Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
2556 dest_stride, height, taps);
2557 } else {
2558 Filter2DVertical<2, /*is_compound=*/true>(
2559 intermediate_result, dest, dest_stride, width, height, taps);
2560 }
2561 }
2562 }
2563
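// Averages each pixel with its right-hand neighbor using rounding:
// (left + right + 1) >> 1.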
2564 inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
2565 const uint8x16_t left = vld1q_u8(src);
2566 const uint8x16_t right = vld1q_u8(src + 1);
2567 vst1q_u8(dst, vrhaddq_u8(left, right));
2568 }
2569
2570 template <int width>
2571 inline void IntraBlockCopyHorizontal(const uint8_t* src,
2572 const ptrdiff_t src_stride,
2573 const int height, uint8_t* dst,
2574 const ptrdiff_t dst_stride) {
2575 const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
2576 const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
2577
2578 int y = 0;
2579 do {
2580 HalfAddHorizontal(src, dst);
2581 if (width >= 32) {
2582 src += 16;
2583 dst += 16;
2584 HalfAddHorizontal(src, dst);
2585 if (width >= 64) {
2586 src += 16;
2587 dst += 16;
2588 HalfAddHorizontal(src, dst);
2589 src += 16;
2590 dst += 16;
2591 HalfAddHorizontal(src, dst);
2592 if (width == 128) {
2593 src += 16;
2594 dst += 16;
2595 HalfAddHorizontal(src, dst);
2596 src += 16;
2597 dst += 16;
2598 HalfAddHorizontal(src, dst);
2599 src += 16;
2600 dst += 16;
2601 HalfAddHorizontal(src, dst);
2602 src += 16;
2603 dst += 16;
2604 HalfAddHorizontal(src, dst);
2605 }
2606 }
2607 }
2608 src += src_remainder_stride;
2609 dst += dst_remainder_stride;
2610 } while (++y < height);
2611 }
2612
2613 void ConvolveIntraBlockCopyHorizontal_NEON(
2614 const void* const reference, const ptrdiff_t reference_stride,
2615 const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
2616 const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
2617 const int height, void* const prediction, const ptrdiff_t pred_stride) {
2618 const auto* src = static_cast<const uint8_t*>(reference);
2619 auto* dest = static_cast<uint8_t*>(prediction);
2620
2621 if (width == 128) {
2622 IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
2623 pred_stride);
2624 } else if (width == 64) {
2625 IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
2626 pred_stride);
2627 } else if (width == 32) {
2628 IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
2629 pred_stride);
2630 } else if (width == 16) {
2631 IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
2632 pred_stride);
2633 } else if (width == 8) {
2634 int y = 0;
2635 do {
2636 const uint8x8_t left = vld1_u8(src);
2637 const uint8x8_t right = vld1_u8(src + 1);
2638 vst1_u8(dest, vrhadd_u8(left, right));
2639
2640 src += reference_stride;
2641 dest += pred_stride;
2642 } while (++y < height);
2643 } else if (width == 4) {
2644 uint8x8_t left = vdup_n_u8(0);
2645 uint8x8_t right = vdup_n_u8(0);
2646 int y = 0;
2647 do {
2648 left = Load4<0>(src, left);
2649 right = Load4<0>(src + 1, right);
2650 src += reference_stride;
2651 left = Load4<1>(src, left);
2652 right = Load4<1>(src + 1, right);
2653 src += reference_stride;
2654
2655 const uint8x8_t result = vrhadd_u8(left, right);
2656
2657 StoreLo4(dest, result);
2658 dest += pred_stride;
2659 StoreHi4(dest, result);
2660 dest += pred_stride;
2661 y += 2;
2662 } while (y < height);
2663 } else {
2664 assert(width == 2);
2665 uint8x8_t left = vdup_n_u8(0);
2666 uint8x8_t right = vdup_n_u8(0);
2667 int y = 0;
2668 do {
2669 left = Load2<0>(src, left);
2670 right = Load2<0>(src + 1, right);
2671 src += reference_stride;
2672 left = Load2<1>(src, left);
2673 right = Load2<1>(src + 1, right);
2674 src += reference_stride;
2675
2676 const uint8x8_t result = vrhadd_u8(left, right);
2677
2678 Store2<0>(dest, result);
2679 dest += pred_stride;
2680 Store2<1>(dest, result);
2681 dest += pred_stride;
2682 y += 2;
2683 } while (y < height);
2684 }
2685 }
2686
2687 template <int width>
2688 inline void IntraBlockCopyVertical(const uint8_t* src,
2689 const ptrdiff_t src_stride, const int height,
2690 uint8_t* dst, const ptrdiff_t dst_stride) {
2691 const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
2692 const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
2693 uint8x16_t row[8], below[8];
2694
2695 row[0] = vld1q_u8(src);
2696 if (width >= 32) {
2697 src += 16;
2698 row[1] = vld1q_u8(src);
2699 if (width >= 64) {
2700 src += 16;
2701 row[2] = vld1q_u8(src);
2702 src += 16;
2703 row[3] = vld1q_u8(src);
2704 if (width == 128) {
2705 src += 16;
2706 row[4] = vld1q_u8(src);
2707 src += 16;
2708 row[5] = vld1q_u8(src);
2709 src += 16;
2710 row[6] = vld1q_u8(src);
2711 src += 16;
2712 row[7] = vld1q_u8(src);
2713 }
2714 }
2715 }
2716 src += src_remainder_stride;
2717
2718 int y = 0;
2719 do {
2720 below[0] = vld1q_u8(src);
2721 if (width >= 32) {
2722 src += 16;
2723 below[1] = vld1q_u8(src);
2724 if (width >= 64) {
2725 src += 16;
2726 below[2] = vld1q_u8(src);
2727 src += 16;
2728 below[3] = vld1q_u8(src);
2729 if (width == 128) {
2730 src += 16;
2731 below[4] = vld1q_u8(src);
2732 src += 16;
2733 below[5] = vld1q_u8(src);
2734 src += 16;
2735 below[6] = vld1q_u8(src);
2736 src += 16;
2737 below[7] = vld1q_u8(src);
2738 }
2739 }
2740 }
2741 src += src_remainder_stride;
2742
2743 vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
2744 row[0] = below[0];
2745 if (width >= 32) {
2746 dst += 16;
2747 vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
2748 row[1] = below[1];
2749 if (width >= 64) {
2750 dst += 16;
2751 vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
2752 row[2] = below[2];
2753 dst += 16;
2754 vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
2755 row[3] = below[3];
2756 if (width >= 128) {
2757 dst += 16;
2758 vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
2759 row[4] = below[4];
2760 dst += 16;
2761 vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
2762 row[5] = below[5];
2763 dst += 16;
2764 vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
2765 row[6] = below[6];
2766 dst += 16;
2767 vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
2768 row[7] = below[7];
2769 }
2770 }
2771 }
2772 dst += dst_remainder_stride;
2773 } while (++y < height);
2774 }
2775
2776 void ConvolveIntraBlockCopyVertical_NEON(
2777 const void* const reference, const ptrdiff_t reference_stride,
2778 const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
2779 const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
2780 const int height, void* const prediction, const ptrdiff_t pred_stride) {
2781 const auto* src = static_cast<const uint8_t*>(reference);
2782 auto* dest = static_cast<uint8_t*>(prediction);
2783
2784 if (width == 128) {
2785 IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
2786 pred_stride);
2787 } else if (width == 64) {
2788 IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
2789 pred_stride);
2790 } else if (width == 32) {
2791 IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
2792 pred_stride);
2793 } else if (width == 16) {
2794 IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
2795 pred_stride);
2796 } else if (width == 8) {
2797 uint8x8_t row, below;
2798 row = vld1_u8(src);
2799 src += reference_stride;
2800
2801 int y = 0;
2802 do {
2803 below = vld1_u8(src);
2804 src += reference_stride;
2805
2806 vst1_u8(dest, vrhadd_u8(row, below));
2807 dest += pred_stride;
2808
2809 row = below;
2810 } while (++y < height);
2811 } else if (width == 4) {
2812 uint8x8_t row = Load4(src);
2813 uint8x8_t below = vdup_n_u8(0);
2814 src += reference_stride;
2815
2816 int y = 0;
2817 do {
2818 below = Load4<0>(src, below);
2819 src += reference_stride;
2820
2821 StoreLo4(dest, vrhadd_u8(row, below));
2822 dest += pred_stride;
2823
2824 row = below;
2825 } while (++y < height);
2826 } else {
2827 assert(width == 2);
2828 uint8x8_t row = Load2(src);
2829 uint8x8_t below = vdup_n_u8(0);
2830 src += reference_stride;
2831
2832 int y = 0;
2833 do {
2834 below = Load2<0>(src, below);
2835 src += reference_stride;
2836
2837 Store2<0>(dest, vrhadd_u8(row, below));
2838 dest += pred_stride;
2839
2840 row = below;
2841 } while (++y < height);
2842 }
2843 }
2844
2845 template <int width>
2846 inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
2847 const int height, uint8_t* dst,
2848 const ptrdiff_t dst_stride) {
2849 const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
2850 const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
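// The 2D intra block copy output is the rounded average of a 2x2 neighborhood:
// (tl + tr + bl + br + 2) >> 2. |row| caches the horizontal pair sums of the
// previous row so each source row is only loaded once.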
2851 uint16x8_t row[16];
2852 row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2853 if (width >= 16) {
2854 src += 8;
2855 row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2856 if (width >= 32) {
2857 src += 8;
2858 row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2859 src += 8;
2860 row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2861 if (width >= 64) {
2862 src += 8;
2863 row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2864 src += 8;
2865 row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2866 src += 8;
2867 row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2868 src += 8;
2869 row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2870 if (width == 128) {
2871 src += 8;
2872 row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2873 src += 8;
2874 row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2875 src += 8;
2876 row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2877 src += 8;
2878 row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2879 src += 8;
2880 row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2881 src += 8;
2882 row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2883 src += 8;
2884 row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2885 src += 8;
2886 row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
2887 }
2888 }
2889 }
2890 }
2891 src += src_remainder_stride;
2892
  int y = 0;
  do {
    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
    row[0] = below_0;
    if (width >= 16) {
      src += 8;
      dst += 8;

      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
      row[1] = below_1;
      if (width >= 32) {
        src += 8;
        dst += 8;

        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
        row[2] = below_2;
        src += 8;
        dst += 8;

        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
        row[3] = below_3;
        if (width >= 64) {
          src += 8;
          dst += 8;

          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
          row[4] = below_4;
          src += 8;
          dst += 8;

          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
          row[5] = below_5;
          src += 8;
          dst += 8;

          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
          row[6] = below_6;
          src += 8;
          dst += 8;

          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
          row[7] = below_7;
          if (width == 128) {
            src += 8;
            dst += 8;

            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
            row[8] = below_8;
            src += 8;
            dst += 8;

            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
            row[9] = below_9;
            src += 8;
            dst += 8;

            const uint16x8_t below_10 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
            row[10] = below_10;
            src += 8;
            dst += 8;

            const uint16x8_t below_11 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
            row[11] = below_11;
            src += 8;
            dst += 8;

            const uint16x8_t below_12 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
            row[12] = below_12;
            src += 8;
            dst += 8;

            const uint16x8_t below_13 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
            row[13] = below_13;
            src += 8;
            dst += 8;

            const uint16x8_t below_14 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
            row[14] = below_14;
            src += 8;
            dst += 8;

            const uint16x8_t below_15 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
            row[15] = below_15;
          }
        }
      }
    }
    src += src_remainder_stride;
    dst += dst_remainder_stride;
  } while (++y < height);
}

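// Dispatch on |width|: widths of 8 and above use the IntraBlockCopy2D template
// above; widths 4 and 2 are handled inline, two output rows per iteration.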
void ConvolveIntraBlockCopy2D_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* const prediction, const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);
  // Note: vertical reads may extend to height + 1 rows. Because this function
  // is only used for the u/v planes of intra block copy, such access is
  // guaranteed to stay within the prediction block.
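  // For reference, a scalar sketch of what every width path below computes
  // (illustrative only, not compiled):
  //   for (int y = 0; y < height; ++y) {
  //     for (int x = 0; x < width; ++x) {
  //       dest[x] = (src[x] + src[x + 1] + src[reference_stride + x] +
  //                  src[reference_stride + x + 1] + 2) >> 2;
  //     }
  //     src += reference_stride;
  //     dest += pred_stride;
  //   }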

  if (width == 128) {
    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 64) {
    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 32) {
    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 16) {
    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 8) {
    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 4) {
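    // width == 4: pack two source rows into the low and high halves of
    // |left|/|right| so a single vaddl_u8 produces pair sums for both rows,
    // and one rounded add emits two output rows per iteration.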
    uint8x8_t left = Load4(src);
    uint8x8_t right = Load4(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  } else {
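    // width == 2: the same two-rows-per-iteration scheme, with each row's two
    // pixels loaded into 16-bit lanes 0 and 2 of |left|/|right| and written
    // back with Store2<0>/Store2<2>.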
    uint8x8_t left = Load2(src);
    uint8x8_t right = Load2(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<2>(src, left);
      right = Load2<2>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<2>(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
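  // Entries are indexed as convolve[is_intra_block_copy][is_compound]
  // [has_vertical_filter][has_horizontal_filter]; convolve_scale is indexed
  // by is_compound.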
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
  dsp->convolve[0][0][1][1] = Convolve2D_NEON;

  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;

  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;

  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
}

}  // namespace
}  // namespace low_bitdepth

void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_NEON

namespace libgav1 {
namespace dsp {

void ConvolveInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON