// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Common 128 bit functions used for sse4/avx2 convolve implementations.
// This will be included inside an anonymous namespace in files where these are
// necessary.

#include "src/dsp/convolve.inc"

// This version checks for the special cases when filter_index == 1.
int GetNumTapsInFilter(const int filter_index, const int filter_id) {
  if (filter_index == 0) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    return 6;
  }

  if (filter_index == 1) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
         (filter_id == 8) | (filter_id == 9)) != 0) {
      return 6;
    }
    // When |filter_index| == 1, the |filter_id| values not listed above map to
    // 4 tap filters.
    return 4;
  }

  if (filter_index == 2) {
    // kInterpolationFilterEightTapSharp
    return 8;
  }

  if (filter_index == 3) {
    // kInterpolationFilterBilinear
    return 2;
  }

  assert(filter_index > 3);
  // For small sizes (width/height <= 4) the large filters are replaced with 4
  // tap options.
  // If the original filters were |kInterpolationFilterEightTap| or
  // |kInterpolationFilterEightTapSharp| then it becomes
  // |kInterpolationFilterSwitchable|.
  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
  // tap filter.
  return 4;
}
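
// For example, with |filter_index| == 1, |filter_id| == 8 is in the list above
// and selects the 6 tap path, while |filter_id| == 4 is not and therefore maps
// to a 4 tap filter.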

// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
  __m128i sum;
  if (num_taps == 6) {
    // 6 taps.
    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
    sum = _mm_add_epi16(v_madd_21, v_madd_43);
    sum = _mm_add_epi16(sum, v_madd_65);
  } else if (num_taps == 8) {
    // 8 taps.
    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
  } else if (num_taps == 2) {
    // 2 taps.
    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
  } else {
    // 4 taps.
    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
    sum = _mm_add_epi16(v_madd_32, v_madd_54);
  }
  return sum;
}
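
// Rough illustration of the pre-shift mentioned above, assuming 8 bit pixels
// and kFilterBits == 7: the taps nominally sum to 1 << kFilterBits == 128, but
// the sum of their absolute values is larger for the mixed-sign filters, so
// accumulating 255 * |tap| products at full scale could overflow (or saturate
// inside _mm_maddubs_epi16()) an int16_t lane. Halving the taps up front keeps
// the whole accumulation chain in range, which is why the final shifts
// elsewhere in this file use kFilterBits - 1 and kInterRoundBits* - 1 rather
// than the nominal values.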

template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                             const __m128i* const v_tap) {
  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);

  if (num_taps == 2) {
    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
    const __m128i v_src_43 = _mm_shuffle_epi8(
        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
    return v_sum_43;
  }

  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
  const __m128i v_src_32 = _mm_shuffle_epi8(
      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
  const __m128i v_src_54 = _mm_shuffle_epi8(
      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
                           static_cast<int>(0x80070706), 0x06050504));
  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
  return v_sum_5432;
}
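
// Note on the shuffle masks above: _mm_shuffle_epi8() copies the source byte
// selected by the low bits of each mask byte and writes zero wherever a mask
// byte has its high bit set, so the 0x80 entries in the k5k4 mask zero the
// lanes marked xx past the end of each row.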

template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                const __m128i* const v_tap) {
  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);

  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
  return _mm_packus_epi16(sum, sum);
}
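
// Concrete instance of the combined rounding above, assuming 8 bit pixels
// where kInterRoundBitsHorizontal == 3 and kFilterBits == 7: the two pass
// version would be a rounding shift by kInterRoundBitsHorizontal - 1 == 2
// followed by one by kFilterBits - kInterRoundBitsHorizontal == 4. Folding
// them into the single rounding shift by kFilterBits - 1 == 6 drops the first
// shift's rounding offset, so 1 << (kInterRoundBitsHorizontal - 2) == 2 is
// added back beforehand.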

template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                const __m128i* const v_tap) {
  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);

  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}

template <int num_taps, bool is_2d_vertical = false>
LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
                                     __m128i* v_tap) {
  if (num_taps == 8) {
    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
    }
  } else if (num_taps == 6) {
    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
    }
  } else if (num_taps == 4) {
    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
    }
  } else {  // num_taps == 2
    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
    }
  }
}
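
// Layout sketch for the 8 tap case above, assuming |*filter| holds the eight
// int8_t coefficients k0..k7 in its low 8 bytes: _mm_shufflelo_epi16() with
// mask 0x0 broadcasts the 16 bit pair (k1, k0) across the four low words, and
// _mm_unpacklo_epi64() then mirrors that half into the high half, leaving
// v_tap[0] as k1k0 repeated eight times, ready for _mm_maddubs_epi16() against
// a register of matching interleaved pixel pairs. The 2D vertical variant
// instead widens the pairs to int16_t for use with _mm_madd_epi16().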

template <int num_taps, bool is_compound>
__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
                                const __m128i* const taps) {
  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
  if (num_taps >= 4) {
    __m128i madd_lo =
        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
    __m128i madd_hi =
        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
    if (num_taps >= 6) {
      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
      if (num_taps == 8) {
        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
      }
    }
  }

  if (is_compound) {
    return _mm_packs_epi32(
        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
        RightShiftWithRounding_S32(sum_hi,
                                   kInterRoundBitsCompoundVertical - 1));
  }

  return _mm_packs_epi32(
      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
}
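
// The "- 1" in the shifts above mirrors the halved taps produced by
// SetupTaps<..., /*is_2d_vertical=*/true>(): the nominal rounding shifts are
// kInterRoundBitsVertical and kInterRoundBitsCompoundVertical (11 and 7,
// respectively, for 8 bit content), and one of those bits has effectively been
// applied already by pre-shifting the filter coefficients.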

template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, void* const dst,
                      const ptrdiff_t dst_stride, const int width,
                      const int height, const __m128i* const taps) {
  assert(width >= 8);
  constexpr int next_row = num_taps - 1;
  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
  const ptrdiff_t src_stride = width;

  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int x = 0;
  do {
    __m128i srcs[8];
    const uint16_t* src_x = src + x;
    srcs[0] = LoadAligned16(src_x);
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = LoadAligned16(src_x);
      src_x += src_stride;
      srcs[2] = LoadAligned16(src_x);
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = LoadAligned16(src_x);
        src_x += src_stride;
        srcs[4] = LoadAligned16(src_x);
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = LoadAligned16(src_x);
          src_x += src_stride;
          srcs[6] = LoadAligned16(src_x);
          src_x += src_stride;
        }
      }
    }

    auto* dst8_x = dst8 + x;
    auto* dst16_x = dst16 + x;
    int y = height;
    do {
      srcs[next_row] = LoadAligned16(src_x);
      src_x += src_stride;

      const __m128i sum =
          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
      if (is_compound) {
        StoreUnaligned16(dst16_x, sum);
        dst16_x += dst_stride;
      } else {
        StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
        dst8_x += dst_stride;
      }

      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (--y != 0);
    x += 8;
  } while (x < width);
}
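
// Usage sketch (illustrative only; the caller provides the intermediate
// buffer and taps): for a 2D filter the horizontal pass is expected to write
// height + num_taps - 1 rows of uint16_t with a row pitch of |width|, after
// which the vertical pass above produces the final width x height block, e.g.
//
//   Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
//                       taps);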

// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
void Filter2DVertical4xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const __m128i* const taps) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  __m128i srcs[9];
  srcs[0] = LoadAligned16(src);
  src += 8;
  if (num_taps >= 4) {
    srcs[2] = LoadAligned16(src);
    src += 8;
    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
    if (num_taps >= 6) {
      srcs[4] = LoadAligned16(src);
      src += 8;
      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
      if (num_taps == 8) {
        srcs[6] = LoadAligned16(src);
        src += 8;
        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
      }
    }
  }

  int y = height;
  do {
    srcs[num_taps] = LoadAligned16(src);
    src += 8;
    srcs[num_taps - 1] = _mm_unpacklo_epi64(
        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);

    const __m128i sum =
        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
    if (is_compound) {
      StoreUnaligned16(dst16, sum);
      dst16 += 4 << 1;
    } else {
      const __m128i results = _mm_packus_epi16(sum, sum);
      Store4(dst8, results);
      dst8 += dst_stride;
      Store4(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
    }

    srcs[0] = srcs[2];
    if (num_taps >= 4) {
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      if (num_taps >= 6) {
        srcs[3] = srcs[5];
        srcs[4] = srcs[6];
        if (num_taps == 8) {
          srcs[5] = srcs[7];
          srcs[6] = srcs[8];
        }
      }
    }
    y -= 2;
  } while (y != 0);
}

// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
void Filter2DVertical2xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const __m128i* const taps) {
  constexpr int next_row = (num_taps < 6) ? 4 : 8;

  auto* dst8 = static_cast<uint8_t*>(dst);

  __m128i srcs[9];
  srcs[0] = LoadAligned16(src);
  src += 8;
  if (num_taps >= 6) {
    srcs[4] = LoadAligned16(src);
    src += 8;
    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
    if (num_taps == 8) {
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
    }
  }

  int y = height;
  do {
    srcs[next_row] = LoadAligned16(src);
    src += 8;
    if (num_taps == 2) {
      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
    } else if (num_taps == 4) {
      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
    } else if (num_taps == 6) {
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
    } else if (num_taps == 8) {
      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
    }

    const __m128i sum =
        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
    const __m128i results = _mm_packus_epi16(sum, sum);

    Store2(dst8, results);
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 2));
    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
    // Therefore we don't need to check this condition when |height| > 4.
    if (num_taps <= 4 && height == 2) return;
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 4));
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 6));
    dst8 += dst_stride;

    srcs[0] = srcs[4];
    if (num_taps == 6) {
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
    } else if (num_taps == 8) {
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
    }

    y -= 4;
  } while (y != 0);
}

// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
// Vertical calculations.
__m128i Compound1DShift(const __m128i sum) {
  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
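
// As with the other shifts in this file, the "- 1" above compensates for the
// pre-shifted (halved) taps, so the stored compound value matches a nominal
// rounding shift by kInterRoundBitsHorizontal applied to an unshifted filter
// sum.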

template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
  __m128i v_src[4];

  if (num_taps == 6) {
    // 6 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
  } else if (num_taps == 8) {
    // 8 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
  } else if (num_taps == 2) {
    // 2 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
  } else {
    // 4 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
  }
  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
  return sum;
}
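
// For example, with num_taps == 2 and rows r0/r1 in srcs[0]/srcs[1],
// _mm_unpacklo_epi8() produces (r0[0], r1[0], r0[1], r1[1], ...), so each
// 16 bit lane of the _mm_maddubs_epi16() in SumOnePassTaps() holds
// r0[i] * k3 + r1[i] * k4, i.e. one filtered pixel of the output row.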

template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const __m128i* const v_tap) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  __m128i srcs[9];

  if (num_taps == 2) {
    srcs[2] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;

    int y = height;
    do {
      // 10 11 12 13
      const __m128i a = Load4(src);
      // 00 01 02 03 10 11 12 13
      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
      src += src_stride;
      // 20 21 22 23
      srcs[2] = Load4(src);
      src += src_stride;
      // 10 11 12 13 20 21 22 23
      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 4) {
    srcs[4] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);

    int y = height;
    do {
      // 30 31 32 33
      const __m128i b = Load4(src);
      // 20 21 22 23 30 31 32 33
      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
      src += src_stride;
      // 40 41 42 43
      srcs[4] = Load4(src);
      src += src_stride;
      // 30 31 32 33 40 41 42 43
      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 6) {
    srcs[6] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
    // 30 31 32 33
    const __m128i b = Load4(src);
    // 20 21 22 23 30 31 32 33
    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
    src += src_stride;
    // 40 41 42 43
    srcs[4] = Load4(src);
    src += src_stride;
    // 30 31 32 33 40 41 42 43
    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);

    int y = height;
    do {
      // 50 51 52 53
      const __m128i c = Load4(src);
      // 40 41 42 43 50 51 52 53
      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
      src += src_stride;
      // 60 61 62 63
      srcs[6] = Load4(src);
      src += src_stride;
      // 50 51 52 53 60 61 62 63
      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 8) {
    srcs[8] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
    // 30 31 32 33
    const __m128i b = Load4(src);
    // 20 21 22 23 30 31 32 33
    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
    src += src_stride;
    // 40 41 42 43
    srcs[4] = Load4(src);
    src += src_stride;
    // 30 31 32 33 40 41 42 43
    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
    // 50 51 52 53
    const __m128i c = Load4(src);
    // 40 41 42 43 50 51 52 53
    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
    src += src_stride;
    // 60 61 62 63
    srcs[6] = Load4(src);
    src += src_stride;
    // 50 51 52 53 60 61 62 63
    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);

    int y = height;
    do {
      // 70 71 72 73
      const __m128i d = Load4(src);
      // 60 61 62 63 70 71 72 73
      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
      src += src_stride;
      // 80 81 82 83
      srcs[8] = Load4(src);
      src += src_stride;
      // 70 71 72 73 80 81 82 83
      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      srcs[5] = srcs[7];
      srcs[6] = srcs[8];
      y -= 2;
    } while (y != 0);
  }
}
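
// Each iteration above filters two 4 pixel rows at once: the rows are packed
// in pairs into the low halves of |srcs|, so a single SumVerticalTaps() call
// yields eight 16 bit results covering both rows. This is also why the
// compound path advances |dst16| by 4 << 1 per iteration.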

template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const __m128i* const v_tap) {
  auto* dst8 = static_cast<uint8_t*>(dst);

  __m128i srcs[9];

  if (num_taps == 2) {
    srcs[2] = _mm_setzero_si128();
    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;

    int y = height;
    do {
      // 00 01 10 11
      srcs[0] = Load2<1>(src, srcs[0]);
      src += src_stride;
      // 00 01 10 11 20 21
      srcs[0] = Load2<2>(src, srcs[0]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      // 40 41
      srcs[2] = Load2<0>(src, srcs[2]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41
      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
      // 10 11 20 21 30 31 40 41
      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
      // This uses srcs[0]..srcs[1].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      if (height == 2) return;
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[2];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 4) {
    srcs[4] = _mm_setzero_si128();

    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;

    int y = height;
    do {
      // 00 01 10 11 20 21 30 31
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      // 40 41
      srcs[4] = Load2<0>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
      // 10 11 20 21 30 31 40 41
      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
      // 20 21 30 31 40 41 50 51
      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
      // 30 31 40 41 50 51 60 61
      srcs[3] = _mm_srli_si128(srcs_0_4, 6);

      // This uses srcs[0]..srcs[3].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      if (height == 2) return;
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 6) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = _mm_setzero_si128();

    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21 30 31
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    // 40 41
    srcs[4] = Load2(src);
    src += src_stride;
    // 00 01 10 11 20 21 30 31 40 41
    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
    // 10 11 20 21 30 31 40 41
    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);

    int y = height;
    do {
      // 40 41 50 51
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61 70 71
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      // 80 81
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
      // 20 21 30 31 40 41 50 51
      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
      // 30 31 40 41 50 51 60 61
      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
      // 50 51 60 61 70 71 80 81
      srcs[5] = _mm_srli_si128(srcs_4_8, 2);

      // This uses srcs[0]..srcs[5].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 8) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = _mm_setzero_si128();
    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21 30 31
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    // 40 41
    srcs[4] = Load2(src);
    src += src_stride;
    // 40 41 50 51
    srcs[4] = Load2<1>(src, srcs[4]);
    src += src_stride;
    // 40 41 50 51 60 61
    srcs[4] = Load2<2>(src, srcs[4]);
    src += src_stride;

    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
    // 10 11 20 21 30 31 40 41
    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
    // 20 21 30 31 40 41 50 51
    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
    // 30 31 40 41 50 51 60 61
    srcs[3] = _mm_srli_si128(srcs_0_4, 6);

    int y = height;
    do {
      // 40 41 50 51 60 61 70 71
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      // 80 81
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      // 80 81 90 91
      srcs[8] = Load2<1>(src, srcs[8]);
      src += src_stride;
      // 80 81 90 91 a0 a1
      srcs[8] = Load2<2>(src, srcs[8]);
      src += src_stride;

      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
      // 50 51 60 61 70 71 80 81
      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
      // 60 61 70 71 80 81 90 91
      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
      // 70 71 80 81 90 91 a0 a1
      srcs[7] = _mm_srli_si128(srcs_4_8, 6);

      // This uses srcs[0]..srcs[7].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
      y -= 4;
    } while (y != 0);
  }
}

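// Putting the pieces together, a 1D vertical pass over a narrow block is
// expected to look roughly like the following sketch (illustrative only:
// |filter_index|, |filter_id|, |width|, etc. are placeholders supplied by the
// caller, and the kHalfSubPixelFilters table is assumed to come from
// convolve.inc):
//
//   __m128i taps[4];
//   const __m128i v_filter =
//       LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
//   SetupTaps<6>(&v_filter, taps);  // Instantiated per tap count in practice.
//   if (width == 2) {
//     FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
//   } else if (width == 4) {
//     FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
//   }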