// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/convolve.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
constexpr int kHorizontalOffset = 3;
constexpr int kFilterIndexShift = 6;

// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
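// Taps with negative signs are applied with vmlsl_u8, so the running
// uint16x8_t total may wrap modulo 2^16; because the true signed result fits
// in int16_t, the final vreinterpretq_s16_u16() still yields the correct
// value.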
template <int filter_index, bool negative_outside_taps = false>
int16x8_t SumOnePassTaps(const uint8x8_t* const src,
                         const uint8x8_t* const taps) {
  uint16x8_t sum;
  if (filter_index == 0) {
    // 6 taps. + - + + - +
    sum = vmull_u8(src[0], taps[0]);
    // Unsigned overflow will result in a valid int16_t value.
    sum = vmlsl_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlsl_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1 && negative_outside_taps) {
    // 6 taps. - + + + + -
    // Set a base we can subtract from.
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1) {
    // 6 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 2) {
    // 8 taps. - + - + + - + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlsl_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
    sum = vmlal_u8(sum, src[6], taps[6]);
    sum = vmlsl_u8(sum, src[7], taps[7]);
  } else if (filter_index == 3) {
    // 2 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
  } else if (filter_index == 4) {
    // 4 taps. - + + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlsl_u8(sum, src[3], taps[3]);
  } else if (filter_index == 5) {
    // 4 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
  }
  return vreinterpretq_s16_u16(sum);
}

template <int filter_index, bool negative_outside_taps>
int16x8_t SumHorizontalTaps(const uint8_t* const src,
                            const uint8x8_t* const v_tap) {
  uint8x8_t v_src[8];
  const uint8x16_t src_long = vld1q_u8(src);
  int16x8_t sum;

  if (filter_index < 2) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
  } else if (filter_index == 2) {
    v_src[0] = vget_low_u8(src_long);
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
  } else if (filter_index == 3) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
  } else if (filter_index > 3) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
  }
  return sum;
}

template <int filter_index, bool negative_outside_taps>
uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  int16x8_t sum =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
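  // A rounding right shift by n adds 1 << (n - 1) before shifting. The
  // skipped first stage shifts by kInterRoundBitsHorizontal - 1, so its
  // rounding term is 1 << (kInterRoundBitsHorizontal - 2), added below.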
  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);

  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
  return vqrshrun_n_s16(sum, kFilterBits - 1);
}

template <int filter_index, bool negative_outside_taps>
uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  const int16x8_t sum =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);

  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}

template <int filter_index>
int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                               const uint8x8_t* const v_tap) {
  uint16x8_t sum;
  const uint8x8_t input0 = vld1_u8(src);
  src += src_stride;
  const uint8x8_t input1 = vld1_u8(src);
  uint8x8x2_t input = vzip_u8(input0, input1);

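  // vzip interleaves the two rows: input.val[0] holds columns 0..3 of both
  // rows and input.val[1] holds columns 4..7, so each tap multiply below
  // filters both output rows at once.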
  if (filter_index == 3) {
    // tap signs : + +
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
  } else if (filter_index == 4) {
    // tap signs : - + + -
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  } else {
    // tap signs : + + + +
    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  }

  return vreinterpretq_s16_u16(sum);
}

template <int filter_index>
uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
                                  const ptrdiff_t src_stride,
                                  const uint8x8_t* const v_tap) {
  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);

  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
  return vqrshrun_n_s16(sum, kFilterBits - 1);
}

template <int filter_index>
uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
                                   const ptrdiff_t src_stride,
                                   const uint8x8_t* const v_tap) {
  const int16x8_t sum =
      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}

template <int num_taps, int step, int filter_index,
          bool negative_outside_taps = true, bool is_2d = false,
          bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
                      void* const dest, const ptrdiff_t pred_stride,
                      const int width, const int height,
                      const uint8x8_t* const v_tap) {
  auto* dest8 = static_cast<uint8_t*>(dest);
  auto* dest16 = static_cast<uint16_t*>(dest);

  // 4 tap filters are never used when width > 4.
  if (num_taps != 4 && width > 4) {
    int y = 0;
    do {
      int x = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
                                                                       v_tap);
          vst1q_u16(&dest16[x], v_sum);
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
                                                                        v_tap);
          vst1_u8(&dest8[x], result);
        }
        x += step;
      } while (x < width);
      src += src_stride;
      dest8 += pred_stride;
      dest16 += pred_stride;
    } while (++y < height);
    return;
  }

  // Horizontal passes only need to account for |num_taps| 2 and 4 when
  // |width| <= 4.
  assert(width <= 4);
  assert(num_taps <= 4);
  if (num_taps <= 4) {
    if (width == 4) {
      int y = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
                                                                       v_tap);
          vst1_u16(dest16, vget_low_u16(v_sum));
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
                                                                        v_tap);
          StoreLo4(&dest8[0], result);
        }
        src += src_stride;
        dest8 += pred_stride;
        dest16 += pred_stride;
      } while (++y < height);
      return;
    }

    if (!is_compound) {
      int y = 0;
      do {
        if (is_2d) {
          const uint16x8_t sum =
              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
          dest16[0] = vgetq_lane_u16(sum, 0);
          dest16[1] = vgetq_lane_u16(sum, 2);
          dest16 += pred_stride;
          dest16[0] = vgetq_lane_u16(sum, 1);
          dest16[1] = vgetq_lane_u16(sum, 3);
          dest16 += pred_stride;
        } else {
          const uint8x8_t sum =
              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

          dest8[0] = vget_lane_u8(sum, 0);
          dest8[1] = vget_lane_u8(sum, 2);
          dest8 += pred_stride;

          dest8[0] = vget_lane_u8(sum, 1);
          dest8[1] = vget_lane_u8(sum, 3);
          dest8 += pred_stride;
        }

        src += src_stride << 1;
        y += 2;
      } while (y < height - 1);

      // The 2d filters have an odd |height| because the horizontal pass
      // generates context for the vertical pass.
      if (is_2d) {
        assert(height % 2 == 1);
        uint16x8_t sum;
        const uint8x8_t input = vld1_u8(src);
        if (filter_index == 3) {  // |num_taps| == 2
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
        } else if (filter_index == 4) {
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        } else {
          assert(filter_index == 5);
          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        }
        // |sum| contains an int16_t value.
        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
        Store2<0>(dest16, sum);
      }
    }
  }
}

// Process 16 bit inputs and output 32 bits.
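// |taps| always holds 8 values, with shorter filters centered within them:
// the 6 tap case reads lanes 1..6, the 4 tap case lanes 2..5 and the 2 tap
// case lanes 3..4.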
template <int num_taps, bool is_compound>
inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
                                    const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum;
  if (num_taps == 8) {
    sum = vmull_lane_s16(src[0], taps_lo, 0);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
  } else if (num_taps == 6) {
    sum = vmull_lane_s16(src[0], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
  } else if (num_taps == 4) {
    sum = vmull_lane_s16(src[0], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
  } else if (num_taps == 2) {
    sum = vmull_lane_s16(src[0], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
  }

  if (is_compound) {
    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
  }

  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
}

template <int num_taps, bool is_compound>
int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
                                  const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum_lo, sum_hi;
  if (num_taps == 8) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
  } else if (num_taps == 6) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
  } else if (num_taps == 4) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
  } else if (num_taps == 2) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
  }

  if (is_compound) {
    return vcombine_s16(
        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
  }

  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
}

template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, void* const dst,
                      const ptrdiff_t dst_stride, const int width,
                      const int height, const int16x8_t taps) {
  assert(width >= 8);
  constexpr int next_row = num_taps - 1;
  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
  const ptrdiff_t src_stride = width;

  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int x = 0;
  do {
    int16x8_t srcs[8];
    const uint16_t* src_x = src + x;
    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
        }
      }
    }

    int y = 0;
    do {
      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;

      const int16x8_t sum =
          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
      if (is_compound) {
        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
      } else {
        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
      }

      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (++y < height);
    x += 8;
  } while (x < width);
}

// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
void Filter2DVertical4xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

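  // With |width| == 4 every 8-lane register packs two consecutive rows, so
  // the odd-indexed entries of |srcs| are assembled from the halves of their
  // even-indexed neighbors.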
  int16x8_t srcs[9];
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 4) {
    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
    if (num_taps >= 6) {
      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
      src += 8;
      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
      if (num_taps == 8) {
        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
        src += 8;
        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
      }
    }
  }

  int y = 0;
  do {
    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
                                      vget_low_s16(srcs[num_taps]));

    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
    if (is_compound) {
      const uint16x8_t results = vreinterpretq_u16_s16(sum);
      vst1q_u16(dst16, results);
      dst16 += 4 << 1;
    } else {
      const uint8x8_t results = vqmovun_s16(sum);

      StoreLo4(dst8, results);
      dst8 += dst_stride;
      StoreHi4(dst8, results);
      dst8 += dst_stride;
    }

    srcs[0] = srcs[2];
    if (num_taps >= 4) {
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      if (num_taps >= 6) {
        srcs[3] = srcs[5];
        srcs[4] = srcs[6];
        if (num_taps == 8) {
          srcs[5] = srcs[7];
          srcs[6] = srcs[8];
        }
      }
    }
    y += 2;
  } while (y < height);
}

// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
void Filter2DVertical2xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  constexpr int next_row = (num_taps < 6) ? 4 : 8;

  auto* dst8 = static_cast<uint8_t*>(dst);

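  // With |width| == 2 every 8-lane register packs four consecutive rows, so
  // intermediate rows are produced by shifting whole 2-lane rows in from the
  // next register with vextq_s16 and vcombine_s16.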
  int16x8_t srcs[9];
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 6) {
    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    if (num_taps == 8) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    }
  }

  int y = 0;
  do {
    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    if (num_taps == 2) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    } else if (num_taps == 4) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    } else if (num_taps == 6) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
    } else if (num_taps == 8) {
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
    }

    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
    const uint8x8_t results = vqmovun_s16(sum);

    Store2<0>(dst8, results);
    dst8 += dst_stride;
    Store2<1>(dst8, results);
    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
    // Therefore we don't need to check this condition when |height| > 4.
    if (num_taps <= 4 && height == 2) return;
    dst8 += dst_stride;
    Store2<2>(dst8, results);
    dst8 += dst_stride;
    Store2<3>(dst8, results);
    dst8 += dst_stride;

    srcs[0] = srcs[4];
    if (num_taps == 6) {
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
    } else if (num_taps == 8) {
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
    }

    y += 4;
  } while (y < height);
}

template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
    const ptrdiff_t dst_stride, const int width, const int height,
    const int subpixel, const int filter_index) {
  // Duplicate the absolute value for each tap.  Negative taps are corrected
  // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
  uint8x8_t v_tap[kSubPixelTaps];
  const int filter_id = (subpixel >> 6) & kSubPixelMask;
  assert(filter_id != 0);

  for (int k = 0; k < kSubPixelTaps; ++k) {
    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
  }

  if (filter_index == 2) {  // 8 tap.
    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 1) {  // 6 tap.
    // Check if outside taps are positive.
    if ((filter_id == 1) | (filter_id == 15)) {
      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
          src, src_stride, dst, dst_stride, width, height, v_tap);
    } else {
      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
          src, src_stride, dst, dst_stride, width, height, v_tap);
    }
  } else if (filter_index == 0) {  // 6 tap.
    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 4) {  // 4 tap.
    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 5) {  // 4 tap.
    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else {  // 2 tap.
    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  }
}

int GetNumTapsInFilter(const int filter_index) {
  if (filter_index < 2) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    return 6;
  }

  if (filter_index == 2) {
    // kInterpolationFilterEightTapSharp
    return 8;
  }

  if (filter_index == 3) {
    // kInterpolationFilterBilinear
    return 2;
  }

  assert(filter_index > 3);
  // For small sizes (width/height <= 4) the large filters are replaced with 4
  // tap options.
  // If the original filters were |kInterpolationFilterEightTap| or
  // |kInterpolationFilterEightTapSharp| then it becomes
  // |kInterpolationFilterSwitchable|.
  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
  // tap filter.
  return 4;
}

void Convolve2D_NEON(const void* const reference,
                     const ptrdiff_t reference_stride,
                     const int horizontal_filter_index,
                     const int vertical_filter_index, const int subpixel_x,
                     const int subpixel_y, const int width, const int height,
                     void* prediction, const ptrdiff_t pred_stride) {
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);

  // The output of the horizontal filter is guaranteed to fit in 16 bits.
  uint16_t
      intermediate_result[kMaxSuperBlockSizeInPixels *
                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
  const int intermediate_height = height + vertical_taps - 1;

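  // Offset the source so the filter windows are centered on the block: back
  // up (vertical_taps / 2 - 1) rows to generate the context the vertical pass
  // needs, and kHorizontalOffset columns for the widest horizontal filter.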
  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;

  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
                                   width, intermediate_height, subpixel_x,
                                   horiz_filter_index);

  // Vertical filter.
  auto* dest = static_cast<uint8_t*>(prediction);
  const ptrdiff_t dest_stride = pred_stride;
  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
  assert(filter_id != 0);

  const int16x8_t taps =
      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));

  if (vertical_taps == 8) {
    if (width == 2) {
      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else if (vertical_taps == 6) {
    if (width == 2) {
      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else if (vertical_taps == 4) {
    if (width == 2) {
      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else {  // |vertical_taps| == 2
    if (width == 2) {
      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  }
}

// There are many opportunities for overreading in scaled convolve, because the
// range of starting points for filter windows is anywhere from 0 to 16 for 8
// destination pixels, and the window sizes range from 2 to 8. To accommodate
// this range concisely, we use |grade_x| to mean the most steps in src that can
// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
template <int grade_x>
inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
  uint8x8x3_t ret;
  const uint8x16_t src_val = vld1q_u8(src_x);
  ret.val[0] = vget_low_u8(src_val);
  ret.val[1] = vget_high_u8(src_val);
  if (grade_x > 1) {
    ret.val[2] = vld1_u8(src_x + 16);
  }
  return ret;
}

// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
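// Each column k holds tap k for all 16 filter ids, so one VQTbl1U8 lookup per
// tap fetches the correct tap value for every lane's filter_id.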
inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
  assert(tap_index < 2);
  alignas(
      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};

  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
}

template <int grade_x>
inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
                                         const ptrdiff_t src_stride,
                                         const int width, const int subpixel_x,
                                         const int step_x,
                                         const int intermediate_height,
                                         int16_t* intermediate) {
  // Account for the 0-taps that precede the 2 nonzero taps.
  const int kernel_offset = 3;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
  const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
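  // Lane i of |index_steps| holds i * step_x, so adding the subpel fraction
  // of |p| gives each of the 8 output pixels its absolute subpel position.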
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);

  int p = subpixel_x;
  if (width <= 4) {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
    // This is a special case. The 2-tap filter has no negative taps, so we
    // can use unsigned values.
    // For each x, a lane of tapsK has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                               VQTbl1U8(filter_taps1, filter_indices)};
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x16_t src_vals = vld1q_u8(src_x);
      const uint8x8_t src_indices =
          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));

      // For each x, a lane of srcK contains src_x[k].
      const uint8x8_t src[2] = {
          VQTbl1U8(src_vals, src_indices),
          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};

      vst1q_s16(intermediate,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate += kIntermediateStride;
    } while (++y < intermediate_height);
    return;
  }

  // |width| >= 8
  int x = 0;
  do {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // This is a special case. The 2-tap filter has no negative taps, so we
    // can use unsigned values.
    // For each x, a lane of tapsK has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                               VQTbl1U8(filter_taps1, filter_indices)};
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
      const uint8x8_t src_indices =
          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));

      // For each x, a lane of srcK contains src_x[k].
      const uint8x8_t src[2] = {
          vtbl3_u8(src_vals, src_indices),
          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}

// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
  assert(tap_index < 4);
  alignas(
      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};

  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
}

// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  const int p = subpixel_x;
  // First filter is special, just a 128 tap on the center.
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
  const uint8x8_t filter_indices = vand_u8(
      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

  const uint8x8_t src_indices =
      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
  int y = 0;
  do {
    // Load a pool of samples to select from using stepped index vectors.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, srcK contains src_x[k], with k in [0, 3].
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
                           kInterRoundBitsHorizontal - 1));

    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}

// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
  assert(tap_index < 4);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};

  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
}

// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
                                            static_cast<uint16_t>(step_x));

  const int p = subpixel_x;
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
  const uint8x8_t filter_index_offsets = vshrn_n_u16(
      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
  const uint8x8_t filter_indices =
      vand_u8(filter_index_offsets, filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

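  // |filter_index_offsets| is already >> kFilterIndexShift, so shifting by
  // the remaining (kScaleSubPixelBits - kFilterIndexShift) bits produces the
  // integer source pixel offset for each lane.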
  const uint8x8_t src_indices_base =
      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);

  const uint8x8_t src_indices[4] = {src_indices_base,
                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
                                    vadd_u8(src_indices_base, vdup_n_u8(3))};

  int y = 0;
  do {
    // Load a pool of samples to select from using stepped indices.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, srcK contains src_x[k], with k in [0, 3].
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
                           kInterRoundBitsHorizontal - 1));
    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}

// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
  assert(tap_index < 6);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};

  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
}

// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  const int kernel_offset = 1;
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x16_t filter_taps[6];
  for (int i = 0; i < 6; ++i) {
    filter_taps[i] = GetSigned6TapFilter(i);
  }
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));

  int x = 0;
  int p = subpixel_x;
  do {
    // Avoid overreading outside the reference boundaries. This means
    // |trailing_width| can be up to 24.
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    uint8x8_t src_lookup[6];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 6; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    uint8x8_t taps[6];
    for (int i = 0; i < 6; ++i) {
      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
    }
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      const uint8x8_t src[6] = {
          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}

// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
// has mixed positive and negative outer taps which are handled in
// GetMixed6TapFilter().
inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
  assert(tap_index < 6);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};

  return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
}

inline int8x16_t GetMixed6TapFilter(const int tap_index) {
  assert(tap_index < 2);
  alignas(
      16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
      {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
      {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};

  return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
}

// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  const int kernel_offset = 1;
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x8_t taps[4];
  int16x8_t mixed_taps[2];
  uint8x16_t positive_filter_taps[4];
  for (int i = 0; i < 4; ++i) {
    positive_filter_taps[i] = GetPositive6TapFilter(i);
  }
  int8x16_t mixed_filter_taps[2];
  mixed_filter_taps[0] = GetMixed6TapFilter(0);
  mixed_filter_taps[1] = GetMixed6TapFilter(1);
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));

  int x = 0;
  int p = subpixel_x;
  do {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    uint8x8_t src_lookup[6];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 6; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    for (int i = 0; i < 4; ++i) {
      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
    }
    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));

1238     int y = 0;
1239     do {
1240       // Load a pool of samples to select from using stepped indices.
1241       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1242 
1243       int16x8_t sum_mixed = vmulq_s16(
1244           mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
1245       sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
1246                             ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
1247       uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
1248       sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
1249       sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
1250       sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
1251       sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
1252 
1253       vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
1254                                              kInterRoundBitsHorizontal - 1));
1255       src_x += src_stride;
1256       intermediate_x += kIntermediateStride;
1257     } while (++y < intermediate_height);
1258     x += 8;
1259     p += step_x8;
1260   } while (x < width);
1261 }
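
// Implementation note (illustrative): the kernel above forms a signed dot
// product of the two possibly-negative outer taps, reinterprets it as
// uint16x8_t, and then accumulates the four positive inner taps with
// vmlal_u8. Two's-complement addition matches unsigned modular addition bit
// for bit, so reinterpreting the total back to int16x8_t recovers the correct
// signed sum as long as it lies within int16_t range, which the half-scale
// taps guarantee.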

// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
  assert(tap_index < 8);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};

  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
}
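
// Worked example (illustrative): the columns above store absolute values;
// SumOnePassTaps<2> applies the sign pattern - + - + + - + -. Reading
// filter_id 8 across the eight columns gives {2, 6, 12, 40, 40, 12, 6, 2},
// i.e. the half-scale taps {-2, 6, -12, 40, 40, -12, 6, -2}, which sum to 64.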

// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x8_t taps[8];
  uint8x16_t filter_taps[8];
  for (int i = 0; i < 8; ++i) {
    filter_taps[i] = GetSigned8TapFilter(i);
  }
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  int x = 0;
  int p = subpixel_x;
  do {
    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    uint8x8_t src_lookup[8];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 8; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    for (int i = 0; i < 8; ++i) {
      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
    }

    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      const uint8x8_t src[8] = {
          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}

// This function handles blocks of width 2 or 4.
template <int num_taps, int grade_y, int width, bool is_compound>
void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
                              const int filter_index, const int step_y,
                              const int height, void* dest,
                              const ptrdiff_t dest_stride) {
  constexpr ptrdiff_t src_stride = kIntermediateStride;
  const int16_t* src_y = src;
  // |dest| is 16-bit in compound mode, Pixel otherwise.
  uint16_t* dest16_y = static_cast<uint16_t*>(dest);
  uint8_t* dest_y = static_cast<uint8_t*>(dest);
  int16x4_t s[num_taps + grade_y];

  int p = subpixel_y & 1023;
  int prev_p = p;
  int y = 0;
  do {  // y < height
    for (int i = 0; i < num_taps; ++i) {
      s[i] = vld1_s16(src_y + i * src_stride);
    }
    int filter_id = (p >> 6) & kSubPixelMask;
    int16x8_t filter =
        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
    if (is_compound) {
      assert(width != 2);
      const uint16x4_t result = vreinterpret_u16_s16(sums);
      vst1_u16(dest16_y, result);
    } else {
      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
      if (width == 2) {
        Store2<0>(dest_y, result);
      } else {
        StoreLo4(dest_y, result);
      }
    }
    p += step_y;
    const int p_diff =
        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
    prev_p = p;
    // Here we load extra source in case it is needed. If |p_diff| == 0, these
    // values will be unused, but it's faster to load than to branch.
    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
    if (grade_y > 1) {
      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
    }
    dest16_y += dest_stride;
    dest_y += dest_stride;

    filter_id = (p >> 6) & kSubPixelMask;
    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
    if (is_compound) {
      assert(width != 2);
      const uint16x4_t result = vreinterpret_u16_s16(sums);
      vst1_u16(dest16_y, result);
    } else {
      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
      if (width == 2) {
        Store2<0>(dest_y, result);
      } else {
        StoreLo4(dest_y, result);
      }
    }
    p += step_y;
    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
    prev_p = p;
    dest16_y += dest_stride;
    dest_y += dest_stride;

    y += 2;
  } while (y < height);
}
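
// Implementation note (illustrative): |grade_y| bounds how far the vertical
// phase may advance per output row. With step_y <= 1024 (no vertical
// downscale), (p >> kScaleSubPixelBits) grows by at most 1 per row, so one
// extra source row is preloaded; with 1024 < step_y <= 2048 it can grow by 2,
// so two extra rows are preloaded and |p_diff| re-bases the tap window within
// s[]. The same scheme is used by ConvolveVerticalScale below.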

template <int num_taps, int grade_y, bool is_compound>
inline void ConvolveVerticalScale(const int16_t* src, const int width,
                                  const int subpixel_y, const int filter_index,
                                  const int step_y, const int height,
                                  void* dest, const ptrdiff_t dest_stride) {
  constexpr ptrdiff_t src_stride = kIntermediateStride;
  // A possible improvement is to use arithmetic to decide how many times to
  // apply filters to the same source before checking whether to load new
  // srcs. However, this will only improve performance with very small step
  // sizes.
  int16x8_t s[num_taps + grade_y];
  // |dest| is 16-bit in compound mode, Pixel otherwise.
  uint16_t* dest16_y;
  uint8_t* dest_y;

  int x = 0;
  do {  // x < width
    const int16_t* src_x = src + x;
    const int16_t* src_y = src_x;
    dest16_y = static_cast<uint16_t*>(dest) + x;
    dest_y = static_cast<uint8_t*>(dest) + x;
    int p = subpixel_y & 1023;
    int prev_p = p;
    int y = 0;
    do {  // y < height
      for (int i = 0; i < num_taps; ++i) {
        s[i] = vld1q_s16(src_y + i * src_stride);
      }
      int filter_id = (p >> 6) & kSubPixelMask;
      int16x8_t filter =
          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
      int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
      if (is_compound) {
        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
      } else {
        vst1_u8(dest_y, vqmovun_s16(sum));
      }
      p += step_y;
      const int p_diff =
          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
      // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
      // needed. Otherwise, we only need to load one vector because |p_diff|
      // can't exceed 1.
      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
      if (grade_y > 1) {
        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
      }
      dest16_y += dest_stride;
      dest_y += dest_stride;

      filter_id = (p >> 6) & kSubPixelMask;
      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
      sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
      if (is_compound) {
        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
      } else {
        vst1_u8(dest_y, vqmovun_s16(sum));
      }
      p += step_y;
      src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
      prev_p = p;
      dest16_y += dest_stride;
      dest_y += dest_stride;

      y += 2;
    } while (y < height);
    x += 8;
  } while (x < width);
}

template <bool is_compound>
void ConvolveScale2D_NEON(const void* const reference,
                          const ptrdiff_t reference_stride,
                          const int horizontal_filter_index,
                          const int vertical_filter_index, const int subpixel_x,
                          const int subpixel_y, const int step_x,
                          const int step_y, const int width, const int height,
                          void* prediction, const ptrdiff_t pred_stride) {
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
  assert(step_x <= 2048);
  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
  const int intermediate_height =
      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
       kScaleSubPixelBits) +
      num_vert_taps;
  // The output of the horizontal filter, i.e. the intermediate_result, is
  // guaranteed to fit in int16_t.
  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
                              (2 * kMaxSuperBlockSizeInPixels + 8)];

  // Horizontal filter.
  // Filter types used for width <= 4 are different from those for width > 4.
  // When width > 4, the valid filter index range is always [0, 3].
  // When width <= 4, the valid filter index range is always [3, 5].
  // Similarly for height.
  int filter_index = GetFilterIndex(horizontal_filter_index, width);
  int16_t* intermediate = intermediate_result;
  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference);
  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
  src += vert_kernel_offset * src_stride;

  // Derive the maximum value of |step_x| at which all source values fit in
  // one 16-byte load. The final index is src_x + |num_taps| - 1 < 16.
  // step_x * 7 is the final base subpel index for the shuffle mask for filter
  // inputs in each iteration on large blocks. When step_x is large, we need a
  // larger structure and use a larger table lookup in order to gather all
  // filter inputs.
  // |num_taps| - 1 is the shuffle index of the final filter input.
  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
  const int kernel_start_ceiling = 16 - num_horiz_taps;
  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
  // (step_x * 7) >> kScaleSubPixelBits < single load limit
  const int grade_x_threshold =
      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
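
  // Worked example (illustrative): for an 8-tap filter, num_horiz_taps == 8,
  // kernel_start_ceiling == 8 and grade_x_threshold == (8 << 10) / 7 == 1170.
  // A unity step (step_x == 1024) stays on the grade_x == 1 path with a
  // single 16-byte load per row, while a 2:1 downscale (step_x == 2048)
  // exceeds the threshold and takes the wider grade_x == 2 load path.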
  switch (filter_index) {
    case 0:
      if (step_x > grade_x_threshold) {
        ConvolveKernelHorizontalSigned6Tap<2>(
            src, src_stride, width, subpixel_x, step_x, intermediate_height,
            intermediate);
      } else {
        ConvolveKernelHorizontalSigned6Tap<1>(
            src, src_stride, width, subpixel_x, step_x, intermediate_height,
            intermediate);
      }
      break;
    case 1:
      if (step_x > grade_x_threshold) {
        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
                                             step_x, intermediate_height,
                                             intermediate);
      } else {
        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
                                             step_x, intermediate_height,
                                             intermediate);
      }
      break;
    case 2:
      if (step_x > grade_x_threshold) {
        ConvolveKernelHorizontalSigned8Tap<2>(
            src, src_stride, width, subpixel_x, step_x, intermediate_height,
            intermediate);
      } else {
        ConvolveKernelHorizontalSigned8Tap<1>(
            src, src_stride, width, subpixel_x, step_x, intermediate_height,
            intermediate);
      }
      break;
    case 3:
      if (step_x > grade_x_threshold) {
        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
                                        step_x, intermediate_height,
                                        intermediate);
      } else {
        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
                                        step_x, intermediate_height,
                                        intermediate);
      }
      break;
    case 4:
      assert(width <= 4);
      ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
                                         intermediate_height, intermediate);
      break;
    default:
      assert(filter_index == 5);
      ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
                                           intermediate_height, intermediate);
  }
  // Vertical filter.
  filter_index = GetFilterIndex(vertical_filter_index, height);
  intermediate = intermediate_result;

  switch (filter_index) {
    case 0:
    case 1:
      if (step_y <= 1024) {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<6, 1, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      } else {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<6, 2, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      }
      break;
    case 2:
      if (step_y <= 1024) {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<8, 1, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      } else {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<8, 2, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      }
      break;
    case 3:
      if (step_y <= 1024) {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<2, 1, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      } else {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<2, 2, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      }
      break;
    case 4:
    default:
      assert(filter_index == 4 || filter_index == 5);
      assert(height <= 4);
      if (step_y <= 1024) {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<4, 1, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      } else {
        if (!is_compound && width == 2) {
          ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else if (width == 4) {
          ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
              intermediate, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        } else {
          ConvolveVerticalScale<4, 2, is_compound>(
              intermediate, width, subpixel_y, filter_index, step_y, height,
              prediction, pred_stride);
        }
      }
  }
}

void ConvolveHorizontal_NEON(const void* const reference,
                             const ptrdiff_t reference_stride,
                             const int horizontal_filter_index,
                             const int /*vertical_filter_index*/,
                             const int subpixel_x, const int /*subpixel_y*/,
                             const int width, const int height,
                             void* prediction, const ptrdiff_t pred_stride) {
  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
  // Set |src| to the outermost tap.
  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
  auto* dest = static_cast<uint8_t*>(prediction);

  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
                   subpixel_x, filter_index);
}

// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
// vertical calculations.
uint16x8_t Compound1DShift(const int16x8_t sum) {
  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}
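
// Implementation note (illustrative): the shift is one less than
// |kInterRoundBitsHorizontal| because the half-scale taps produce exactly
// half the full-scale sum (every full-scale tap is even), and a rounding
// shift of x / 2 by n - 1 bits equals a rounding shift of x by n bits.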

template <int filter_index, bool is_compound = false,
          bool negative_outside_taps = false>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
                    void* const dst, const ptrdiff_t dst_stride,
                    const int width, const int height,
                    const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  const int next_row = num_taps - 1;
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);
  assert(width >= 8);

  int x = 0;
  do {
    const uint8_t* src_x = src + x;
    uint8x8_t srcs[8];
    srcs[0] = vld1_u8(src_x);
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vld1_u8(src_x);
      src_x += src_stride;
      srcs[2] = vld1_u8(src_x);
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vld1_u8(src_x);
        src_x += src_stride;
        srcs[4] = vld1_u8(src_x);
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vld1_u8(src_x);
          src_x += src_stride;
          srcs[6] = vld1_u8(src_x);
          src_x += src_stride;
        }
      }
    }

    int y = 0;
    do {
      srcs[next_row] = vld1_u8(src_x);
      src_x += src_stride;

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);
        vst1q_u16(dst16 + x + y * dst_stride, results);
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
        vst1_u8(dst8 + x + y * dst_stride, results);
      }

      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (++y < height);
    x += 8;
  } while (x < width);
}

template <int filter_index, bool is_compound = false,
          bool negative_outside_taps = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  uint8x8_t srcs[9];

  if (num_taps == 2) {
    srcs[2] = vdup_n_u8(0);

    srcs[0] = Load4(src);
    src += src_stride;

    int y = 0;
    do {
      srcs[0] = Load4<1>(src, srcs[0]);
      src += src_stride;
      srcs[2] = Load4<0>(src, srcs[2]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[2], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      y += 2;
    } while (y < height);
  } else if (num_taps == 4) {
    srcs[4] = vdup_n_u8(0);

    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);

    int y = 0;
    do {
      srcs[2] = Load4<1>(src, srcs[2]);
      src += src_stride;
      srcs[4] = Load4<0>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[2], srcs[4], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      y += 2;
    } while (y < height);
  } else if (num_taps == 6) {
    srcs[6] = vdup_n_u8(0);

    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
    srcs[2] = Load4<1>(src, srcs[2]);
    src += src_stride;
    srcs[4] = Load4(src);
    src += src_stride;
    srcs[3] = vext_u8(srcs[2], srcs[4], 4);

    int y = 0;
    do {
      srcs[4] = Load4<1>(src, srcs[4]);
      src += src_stride;
      srcs[6] = Load4<0>(src, srcs[6]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[6], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      y += 2;
    } while (y < height);
  } else if (num_taps == 8) {
    srcs[8] = vdup_n_u8(0);

    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
    srcs[2] = Load4<1>(src, srcs[2]);
    src += src_stride;
    srcs[4] = Load4(src);
    src += src_stride;
    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
    srcs[4] = Load4<1>(src, srcs[4]);
    src += src_stride;
    srcs[6] = Load4(src);
    src += src_stride;
    srcs[5] = vext_u8(srcs[4], srcs[6], 4);

    int y = 0;
    do {
      srcs[6] = Load4<1>(src, srcs[6]);
      src += src_stride;
      srcs[8] = Load4<0>(src, srcs[8]);
      src += src_stride;
      srcs[7] = vext_u8(srcs[6], srcs[8], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      srcs[5] = srcs[7];
      srcs[6] = srcs[8];
      y += 2;
    } while (y < height);
  }
}
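
// Implementation note (illustrative): FilterVertical4xH packs two 4-pixel
// rows into each uint8x8_t and synthesizes the in-between row pairs with
// vext_u8. For example, with srcs[0] holding rows {0, 1} and srcs[2] holding
// rows {2, 3}, vext_u8(srcs[0], srcs[2], 4) yields rows {1, 2}, so each
// SumOnePassTaps call filters two output rows at once. FilterVertical2xH
// below applies the same idea with four 2-pixel rows per vector.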

template <int filter_index, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  auto* dst8 = static_cast<uint8_t*>(dst);

  uint8x8_t srcs[9];

  if (num_taps == 2) {
    srcs[2] = vdup_n_u8(0);

    srcs[0] = Load2(src);
    src += src_stride;

    int y = 0;
    do {
      srcs[0] = Load2<1>(src, srcs[0]);
      src += src_stride;
      srcs[0] = Load2<2>(src, srcs[0]);
      src += src_stride;
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      srcs[2] = Load2<0>(src, srcs[2]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[2], 2);

      // This uses srcs[0]..srcs[1].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      if (height == 2) return;
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[2];
      y += 4;
    } while (y < height);
  } else if (num_taps == 4) {
    srcs[4] = vdup_n_u8(0);

    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;

    int y = 0;
    do {
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      srcs[4] = Load2<0>(src, srcs[4]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[0], srcs[4], 6);

      // This uses srcs[0]..srcs[3].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      if (height == 2) return;
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      y += 4;
    } while (y < height);
  } else if (num_taps == 6) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = vdup_n_u8(0);

    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    srcs[4] = Load2(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[4], 2);

    int y = 0;
    do {
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[8], 2);

      // This uses srcs[0]..srcs[5].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
      y += 4;
    } while (y < height);
  } else if (num_taps == 8) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = vdup_n_u8(0);

    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    srcs[4] = Load2(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
    srcs[4] = Load2<1>(src, srcs[4]);
    src += src_stride;
    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
    srcs[4] = Load2<2>(src, srcs[4]);
    src += src_stride;
    srcs[3] = vext_u8(srcs[0], srcs[4], 6);

    int y = 0;
    do {
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
      srcs[8] = Load2<1>(src, srcs[8]);
      src += src_stride;
      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
      srcs[8] = Load2<2>(src, srcs[8]);
      src += src_stride;
      srcs[7] = vext_u8(srcs[4], srcs[8], 6);

      // This uses srcs[0]..srcs[7].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
      y += 4;
    } while (y < height);
  }
}

// This function is a simplified version of Convolve2D_C.
// It is called in single-prediction mode, when only vertical filtering is
// required. The output is the single prediction of the block, clipped to the
// valid pixel range.
void ConvolveVertical_NEON(const void* const reference,
                           const ptrdiff_t reference_stride,
                           const int /*horizontal_filter_index*/,
                           const int vertical_filter_index,
                           const int /*subpixel_x*/, const int subpixel_y,
                           const int width, const int height, void* prediction,
                           const ptrdiff_t pred_stride) {
  const int filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(filter_index);
  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride;
  auto* dest = static_cast<uint8_t*>(prediction);
  const ptrdiff_t dest_stride = pred_stride;
  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
  assert(filter_id != 0);

  uint8x8_t taps[8];
  for (int k = 0; k < kSubPixelTaps; ++k) {
    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
  }
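
  // Implementation note (illustrative): the filter tables are padded to
  // kSubPixelTaps (8) entries with zeros in the unused outer positions, so
  // the dispatch below passes |taps + 1|, |taps + 2| or |taps + 3| to skip
  // the zero taps and index only live coefficients.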

  if (filter_index == 0) {  // 6 tap.
    if (width == 2) {
      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else {
      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
                        taps + 1);
    }
  } else if ((filter_index == 1) &
             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
    if (width == 2) {
      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else {
      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
                        taps + 1);
    }
  } else if ((filter_index == 1) &
             ((filter_id == 7) | (filter_id == 8) |
              (filter_id == 9))) {  // 6 tap with weird negative taps.
    if (width == 2) {
      FilterVertical2xH<1,
                        /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, height, taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/false,
                        /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, width, height, taps + 1);
    }
  } else if (filter_index == 2) {  // 8 tap.
    if (width == 2) {
      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
    } else if (width == 4) {
      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
    } else {
      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                        taps);
    }
  } else if (filter_index == 3) {  // 2 tap.
    if (width == 2) {
      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
                           taps + 3);
    } else if (width == 4) {
      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
                           taps + 3);
    } else {
      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
                        taps + 3);
    }
  } else if (filter_index == 4) {  // 4 tap.
    // Outside taps are negative.
    if (width == 2) {
      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else if (width == 4) {
      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else {
      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                        taps + 2);
    }
  } else {
    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
    // to 4 tap filters.
    assert(filter_index == 5 ||
           (filter_index == 1 &&
            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
             filter_id == 14)));
    // According to GetNumTapsInFilter() this has 6 taps but here we are
    // treating it as though it has 4.
    if (filter_index == 1) src += src_stride;
    if (width == 2) {
      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else if (width == 4) {
      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else {
      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
                        taps + 2);
    }
  }
}

void ConvolveCompoundCopy_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
  const auto* src = static_cast<const uint8_t*>(reference);
  const ptrdiff_t src_stride = reference_stride;
  auto* dest = static_cast<uint16_t*>(prediction);
  constexpr int final_shift =
      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;

  if (width >= 16) {
    int y = 0;
    do {
      int x = 0;
      do {
        const uint8x16_t v_src = vld1q_u8(&src[x]);
        const uint16x8_t v_dest_lo =
            vshll_n_u8(vget_low_u8(v_src), final_shift);
        const uint16x8_t v_dest_hi =
            vshll_n_u8(vget_high_u8(v_src), final_shift);
        vst1q_u16(&dest[x], v_dest_lo);
        x += 8;
        vst1q_u16(&dest[x], v_dest_hi);
        x += 8;
      } while (x < width);
      src += src_stride;
      dest += width;
    } while (++y < height);
  } else if (width == 8) {
    int y = 0;
    do {
      const uint8x8_t v_src = vld1_u8(&src[0]);
      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
      vst1q_u16(&dest[0], v_dest);
      src += src_stride;
      dest += width;
    } while (++y < height);
  } else { /* width == 4 */
    uint8x8_t v_src = vdup_n_u8(0);

    int y = 0;
    do {
      v_src = Load4<0>(&src[0], v_src);
      src += src_stride;
      v_src = Load4<1>(&src[0], v_src);
      src += src_stride;
      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
      vst1q_u16(&dest[0], v_dest);
      dest += 4 << 1;
      y += 2;
    } while (y < height);
  }
}
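
// Worked example (illustrative, assuming this library's values
// kInterRoundBitsVertical == 11 and kInterRoundBitsCompoundVertical == 7):
// |final_shift| is 4, so the compound copy merely scales pixels into the
// compound intermediate domain, e.g. 255 << 4 == 4080, comfortably within
// uint16_t.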

void ConvolveCompoundVertical_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int vertical_filter_index,
    const int /*subpixel_x*/, const int subpixel_y, const int width,
    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
  const int filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(filter_index);
  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride;
  auto* dest = static_cast<uint16_t*>(prediction);
  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
  assert(filter_id != 0);

  uint8x8_t taps[8];
  for (int k = 0; k < kSubPixelTaps; ++k) {
    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
  }

  if (filter_index == 0) {  // 6 tap.
    if (width == 4) {
      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 1);
    } else {
      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 1);
    }
  } else if ((filter_index == 1) &
             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
    if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 1);
    }
  } else if ((filter_index == 1) &
             ((filter_id == 7) | (filter_id == 8) |
              (filter_id == 9))) {  // 6 tap with weird negative taps.
    if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/true,
                        /*negative_outside_taps=*/true>(src, src_stride, dest,
                                                        4, height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
          src, src_stride, dest, width, width, height, taps + 1);
    }
  } else if (filter_index == 2) {  // 8 tap.
    if (width == 4) {
      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps);
    } else {
      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
  } else if (filter_index == 3) {  // 2 tap.
    if (width == 4) {
      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 3);
    } else {
      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 3);
    }
  } else if (filter_index == 4) {  // 4 tap.
    if (width == 4) {
      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 2);
    } else {
      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 2);
    }
  } else {
    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
    // to 4 tap filters.
    assert(filter_index == 5 ||
           (filter_index == 1 &&
            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
             filter_id == 14)));
    // According to GetNumTapsInFilter() this has 6 taps but here we are
    // treating it as though it has 4.
    if (filter_index == 1) src += src_stride;
    if (width == 4) {
      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 2);
    } else {
      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 2);
    }
  }
}

void ConvolveCompoundHorizontal_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int horizontal_filter_index, const int /*vertical_filter_index*/,
    const int subpixel_x, const int /*subpixel_y*/, const int width,
    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
  auto* dest = static_cast<uint16_t*>(prediction);

  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
      src, reference_stride, dest, width, width, height, subpixel_x,
      filter_index);
}
2490 
void ConvolveCompound2D_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int horizontal_filter_index, const int vertical_filter_index,
    const int subpixel_x, const int subpixel_y, const int width,
    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
  // The output of the horizontal filter, i.e. the intermediate_result, is
  // guaranteed to fit in int16_t.
  uint16_t
      intermediate_result[kMaxSuperBlockSizeInPixels *
                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];

  // Horizontal filter.
  // Filter types used for width <= 4 are different from those for width > 4.
  // When width > 4, the valid filter index range is always [0, 3].
  // When width <= 4, the valid filter index range is always [4, 5].
  // Similarly for height.
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
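  // The horizontal pass must produce |vertical_taps - 1| extra rows so the
  // vertical filter has full support for every output row. For example, with
  // an 8 tap vertical filter and height 32, the intermediate buffer holds
  // 32 + 8 - 1 = 39 rows.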
  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
  const int intermediate_height = height + vertical_taps - 1;
  const ptrdiff_t src_stride = reference_stride;
  const auto* const src = static_cast<const uint8_t*>(reference) -
                          (vertical_taps / 2 - 1) * src_stride -
                          kHorizontalOffset;

  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
      src, src_stride, intermediate_result, width, width, intermediate_height,
      subpixel_x, horiz_filter_index);

  // Vertical filter.
  auto* dest = static_cast<uint16_t*>(prediction);
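  // |subpixel_y| is in 1/1024th of a pixel units. Masking with 1023 keeps the
  // fractional part and shifting by 6 (kFilterIndexShift) reduces it to the
  // 4 bit filter id in [0, 15]. For example, a half pel offset of
  // subpixel_y == 512 yields filter_id == 8.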
  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
  assert(filter_id != 0);

  const ptrdiff_t dest_stride = width;
  const int16x8_t taps =
      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));

  if (vertical_taps == 8) {
    if (width == 4) {
      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<8, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else if (vertical_taps == 6) {
    if (width == 4) {
      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<6, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else if (vertical_taps == 4) {
    if (width == 4) {
      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<4, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else {  // |vertical_taps| == 2
    if (width == 4) {
      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<2, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  }
}

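// vrhaddq_u8() computes the rounding halving add (a + b + 1) >> 1, which is
// exactly the half pel intra block copy filter. For example, neighboring
// pixels 3 and 4 average to (3 + 4 + 1) >> 1 = 4.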
inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
  const uint8x16_t left = vld1q_u8(src);
  const uint8x16_t right = vld1q_u8(src + 1);
  vst1q_u8(dst, vrhaddq_u8(left, right));
}

template <int width>
inline void IntraBlockCopyHorizontal(const uint8_t* src,
                                     const ptrdiff_t src_stride,
                                     const int height, uint8_t* dst,
                                     const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);

  int y = 0;
  do {
    HalfAddHorizontal(src, dst);
    if (width >= 32) {
      src += 16;
      dst += 16;
      HalfAddHorizontal(src, dst);
      if (width >= 64) {
        src += 16;
        dst += 16;
        HalfAddHorizontal(src, dst);
        src += 16;
        dst += 16;
        HalfAddHorizontal(src, dst);
        if (width == 128) {
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
        }
      }
    }
    src += src_remainder_stride;
    dst += dst_remainder_stride;
  } while (++y < height);
}

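// Widths of 16 and above use the 16 byte HalfAddHorizontal() loop. For widths
// 4 and 2 the paths below pack two rows into one 8 byte vector with
// Load4<0>/Load4<1> (or Load2<0>/Load2<1>) so a single vrhadd_u8() produces
// two rows of output.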
void ConvolveIntraBlockCopyHorizontal_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* const prediction, const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
                                  pred_stride);
  } else if (width == 64) {
    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 32) {
    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 16) {
    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 8) {
    int y = 0;
    do {
      const uint8x8_t left = vld1_u8(src);
      const uint8x8_t right = vld1_u8(src + 1);
      vst1_u8(dest, vrhadd_u8(left, right));

      src += reference_stride;
      dest += pred_stride;
    } while (++y < height);
  } else if (width == 4) {
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  } else {
    assert(width == 2);
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<1>(src, left);
      right = Load2<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<1>(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  }
}

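// Each iteration loads only the row below, averages it against the previously
// loaded row held in |row|, and then carries it forward, so every source row
// is read exactly once.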
template <int width>
inline void IntraBlockCopyVertical(const uint8_t* src,
                                   const ptrdiff_t src_stride, const int height,
                                   uint8_t* dst, const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
  uint8x16_t row[8], below[8];

  row[0] = vld1q_u8(src);
  if (width >= 32) {
    src += 16;
    row[1] = vld1q_u8(src);
    if (width >= 64) {
      src += 16;
      row[2] = vld1q_u8(src);
      src += 16;
      row[3] = vld1q_u8(src);
      if (width == 128) {
        src += 16;
        row[4] = vld1q_u8(src);
        src += 16;
        row[5] = vld1q_u8(src);
        src += 16;
        row[6] = vld1q_u8(src);
        src += 16;
        row[7] = vld1q_u8(src);
      }
    }
  }
  src += src_remainder_stride;

  int y = 0;
  do {
    below[0] = vld1q_u8(src);
    if (width >= 32) {
      src += 16;
      below[1] = vld1q_u8(src);
      if (width >= 64) {
        src += 16;
        below[2] = vld1q_u8(src);
        src += 16;
        below[3] = vld1q_u8(src);
        if (width == 128) {
          src += 16;
          below[4] = vld1q_u8(src);
          src += 16;
          below[5] = vld1q_u8(src);
          src += 16;
          below[6] = vld1q_u8(src);
          src += 16;
          below[7] = vld1q_u8(src);
        }
      }
    }
    src += src_remainder_stride;

    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
    row[0] = below[0];
    if (width >= 32) {
      dst += 16;
      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
      row[1] = below[1];
      if (width >= 64) {
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
        row[2] = below[2];
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
        row[3] = below[3];
        if (width == 128) {
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
          row[4] = below[4];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
          row[5] = below[5];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
          row[6] = below[6];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
          row[7] = below[7];
        }
      }
    }
    dst += dst_remainder_stride;
  } while (++y < height);
}

void ConvolveIntraBlockCopyVertical_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* const prediction, const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
                                pred_stride);
  } else if (width == 64) {
    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 32) {
    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 16) {
    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 8) {
    uint8x8_t row, below;
    row = vld1_u8(src);
    src += reference_stride;

    int y = 0;
    do {
      below = vld1_u8(src);
      src += reference_stride;

      vst1_u8(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else if (width == 4) {
    uint8x8_t row = Load4(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load4<0>(src, below);
      src += reference_stride;

      StoreLo4(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else {
    assert(width == 2);
    uint8x8_t row = Load2(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load2<0>(src, below);
      src += reference_stride;

      Store2<0>(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  }
}

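// The 2D intra block copy filter is a rounded average of a 2x2 pixel
// neighborhood: (tl + tr + bl + br + 2) >> 2. vaddl_u8() widens and sums each
// horizontal pair and vrshrn_n_u16(sum, 2) applies the rounding shift. For
// example, the square {10, 11, 12, 13} produces
// (10 + 11 + 12 + 13 + 2) >> 2 == 12.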
template <int width>
inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
                             const int height, uint8_t* dst,
                             const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
  uint16x8_t row[16];
  row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
  if (width >= 16) {
    src += 8;
    row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
    if (width >= 32) {
      src += 8;
      row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      src += 8;
      row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      if (width >= 64) {
        src += 8;
        row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        if (width == 128) {
          src += 8;
          row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        }
      }
    }
  }
  src += src_remainder_stride;

  int y = 0;
  do {
    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
    row[0] = below_0;
    if (width >= 16) {
      src += 8;
      dst += 8;

      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
      row[1] = below_1;
      if (width >= 32) {
        src += 8;
        dst += 8;

        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
        row[2] = below_2;
        src += 8;
        dst += 8;

        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
        row[3] = below_3;
        if (width >= 64) {
          src += 8;
          dst += 8;

          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
          row[4] = below_4;
          src += 8;
          dst += 8;

          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
          row[5] = below_5;
          src += 8;
          dst += 8;

          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
          row[6] = below_6;
          src += 8;
          dst += 8;

          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
          row[7] = below_7;
          if (width == 128) {
            src += 8;
            dst += 8;

            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
            row[8] = below_8;
            src += 8;
            dst += 8;

            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
            row[9] = below_9;
            src += 8;
            dst += 8;

            const uint16x8_t below_10 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
            row[10] = below_10;
            src += 8;
            dst += 8;

            const uint16x8_t below_11 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
            row[11] = below_11;
            src += 8;
            dst += 8;

            const uint16x8_t below_12 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
            row[12] = below_12;
            src += 8;
            dst += 8;

            const uint16x8_t below_13 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
            row[13] = below_13;
            src += 8;
            dst += 8;

            const uint16x8_t below_14 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
            row[14] = below_14;
            src += 8;
            dst += 8;

            const uint16x8_t below_15 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
            row[15] = below_15;
          }
        }
      }
    }
    src += src_remainder_stride;
    dst += dst_remainder_stride;
  } while (++y < height);
}

void ConvolveIntraBlockCopy2D_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* const prediction, const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);
  // Note: this function may access height + 1 source rows. Because it is only
  // used for the u/v planes of intra block copy, such access is guaranteed to
  // be within the prediction block.

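  // For width == 4 and width == 2, two rows are produced per iteration: the
  // previous row's horizontal sums are carried in |row|, joined with the low
  // half of |below| via vcombine_u16(), and added to |below| so one
  // vrshrn_n_u16() yields both output rows.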
  if (width == 128) {
    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 64) {
    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 32) {
    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 16) {
    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 8) {
    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 4) {
    uint8x8_t left = Load4(src);
    uint8x8_t right = Load4(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  } else {
    uint8x8_t left = Load2(src);
    uint8x8_t right = Load2(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<2>(src, left);
      right = Load2<2>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<2>(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
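  // The convolve table indices are, in order:
  // [is_intra_block_copy][is_compound][has_vertical_filter]
  // [has_horizontal_filter].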
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
  dsp->convolve[0][0][1][1] = Convolve2D_NEON;

  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;

  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;

  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
}

}  // namespace
}  // namespace low_bitdepth

void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_NEON

namespace libgav1 {
namespace dsp {

void ConvolveInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON