// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Common 128-bit functions used for the sse4/avx2 convolve implementations.
// This file is included inside an anonymous namespace in the files where
// these functions are needed.

#include "src/dsp/convolve.inc"

// This version checks for the special cases when filter_index == 1.
int GetNumTapsInFilter(const int filter_index, const int filter_id) {
  if (filter_index == 0) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    return 6;
  }

  if (filter_index == 1) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
         (filter_id == 8) | (filter_id == 9)) != 0) {
      return 6;
    }
    // When |filter_index| == 1, the |filter_id| values not listed above map
    // to 4 tap filters.
    return 4;
  }

  if (filter_index == 2) {
    // kInterpolationFilterEightTapSharp
    return 8;
  }

  if (filter_index == 3) {
    // kInterpolationFilterBilinear
    return 2;
  }

  assert(filter_index > 3);
  // For small sizes (width/height <= 4) the large filters are replaced with
  // 4 tap options.
  // If the original filter was |kInterpolationFilterEightTap| or
  // |kInterpolationFilterEightTapSharp| then it becomes
  // |kInterpolationFilterSwitchable|.
  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an
  // unnamed 4 tap filter.
  return 4;
}

// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
  __m128i sum;
  if (num_taps == 6) {
    // 6 taps.
    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
    sum = _mm_add_epi16(v_madd_21, v_madd_43);
    sum = _mm_add_epi16(sum, v_madd_65);
  } else if (num_taps == 8) {
    // 8 taps.
    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
  } else if (num_taps == 2) {
    // 2 taps.
    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
  } else {
    // 4 taps.
    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
    sum = _mm_add_epi16(v_madd_32, v_madd_54);
  }
  return sum;
}
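
// A worked instance of the pre-shift reasoning above (a sketch, not a claim
// about any specific filter): _mm_maddubs_epi16() multiplies unsigned 8-bit
// pixels by signed 8-bit taps and adds adjacent products with signed
// saturation. The positive taps of a phase can sum to more than 128, so if,
// say, they summed to 160, a run of 255-valued pixels would reach partial
// sums of 255 * 160 = 40800, beyond int16_t. With the taps pre-shifted by 1
// the same worst case halves to 255 * 80 = 20400, which is safely in range.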

template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                             const __m128i* const v_tap) {
  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);

  if (num_taps == 2) {
    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
    const __m128i v_src_43 = _mm_shuffle_epi8(
        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
    return v_sum_43;
  }

  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
  const __m128i v_src_32 = _mm_shuffle_epi8(
      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
  const __m128i v_src_54 = _mm_shuffle_epi8(
      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
                           static_cast<int>(0x80070706), 0x06050504));
  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
  return v_sum_5432;
}

template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                const __m128i* const v_tap) {
  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining
  // them requires adding the rounding offset from the skipped shift.
  constexpr int first_shift_rounding_bit =
      1 << (kInterRoundBitsHorizontal - 2);

  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
  return _mm_packus_epi16(sum, sum);
}
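
// Instantiating the combined shift above with the 8bpp constants (assuming
// kFilterBits == 7 and kInterRoundBitsHorizontal == 3): the two-pass flow
// would use rounding shifts of (3 - 1) = 2 and then (7 - 3) = 4. The single
// pass instead adds the skipped pass's rounding offset, 1 << (3 - 2) = 2,
// and applies one rounding shift by (7 - 1) = 6. The "- 1" terms appear
// because the taps are pre-shifted by 1.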

template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                const __m128i* const v_tap) {
  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);

  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}

template <int num_taps, bool is_2d_vertical = false>
LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
                                     __m128i* v_tap) {
  if (num_taps == 8) {
    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
    }
  } else if (num_taps == 6) {
    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
    }
  } else if (num_taps == 4) {
    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
    }
  } else {  // num_taps == 2
    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
    if (is_2d_vertical) {
      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
    } else {
      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
    }
  }
}
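
// A minimal sketch of how SetupTaps() is fed (hypothetical caller; it
// assumes the kHalfSubPixelFilters table from convolve.inc, whose taps are
// already pre-shifted by 1):
//   const __m128i v_filter =
//       LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
//   __m128i v_tap[4];
//   SetupTaps<8>(&v_filter, v_tap);  // v_tap[] = k1k0, k3k2, k5k4, k7k6
// For the 6 and 2 tap cases the byte shift by 1 drops the always-zero k0, so
// the remaining taps pair up as k2k1, k4k3, k6k5 (or k4k3 alone).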

template <int num_taps, bool is_compound>
__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
                                const __m128i* const taps) {
  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
  if (num_taps >= 4) {
    __m128i madd_lo =
        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
    __m128i madd_hi =
        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
    if (num_taps >= 6) {
      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
      if (num_taps == 8) {
        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
      }
    }
  }

  if (is_compound) {
    return _mm_packs_epi32(
        RightShiftWithRounding_S32(sum_lo,
                                   kInterRoundBitsCompoundVertical - 1),
        RightShiftWithRounding_S32(sum_hi,
                                   kInterRoundBitsCompoundVertical - 1));
  }

  return _mm_packs_epi32(
      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
}
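
// With the 8bpp constants (assuming kInterRoundBitsVertical == 11 and
// kInterRoundBitsCompoundVertical == 7), the shifts above work out to 10 and
// 6 respectively; as elsewhere, one bit less than the spec values because
// the taps were pre-shifted by 1.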

template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, void* const dst,
                      const ptrdiff_t dst_stride, const int width,
                      const int height, const __m128i* const taps) {
  assert(width >= 8);
  constexpr int next_row = num_taps - 1;
  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
  const ptrdiff_t src_stride = width;

  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int x = 0;
  do {
    __m128i srcs[8];
    const uint16_t* src_x = src + x;
    srcs[0] = LoadAligned16(src_x);
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = LoadAligned16(src_x);
      src_x += src_stride;
      srcs[2] = LoadAligned16(src_x);
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = LoadAligned16(src_x);
        src_x += src_stride;
        srcs[4] = LoadAligned16(src_x);
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = LoadAligned16(src_x);
          src_x += src_stride;
          srcs[6] = LoadAligned16(src_x);
          src_x += src_stride;
        }
      }
    }

    auto* dst8_x = dst8 + x;
    auto* dst16_x = dst16 + x;
    int y = height;
    do {
      srcs[next_row] = LoadAligned16(src_x);
      src_x += src_stride;

      const __m128i sum =
          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
      if (is_compound) {
        StoreUnaligned16(dst16_x, sum);
        dst16_x += dst_stride;
      } else {
        StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
        dst8_x += dst_stride;
      }

      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (--y != 0);
    x += 8;
  } while (x < width);
}
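
// Filter2DVertical() keeps a sliding window of the most recent num_taps rows
// in |srcs|: each iteration loads exactly one new row into srcs[next_row],
// filters, then shifts the window up by one register. Each 8-wide column
// therefore reads height + num_taps - 1 rows of the intermediate buffer,
// which is what the horizontal pass wrote.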

// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
void Filter2DVertical4xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const __m128i* const taps) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  __m128i srcs[9];
  srcs[0] = LoadAligned16(src);
  src += 8;
  if (num_taps >= 4) {
    srcs[2] = LoadAligned16(src);
    src += 8;
    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
    if (num_taps >= 6) {
      srcs[4] = LoadAligned16(src);
      src += 8;
      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
      if (num_taps == 8) {
        srcs[6] = LoadAligned16(src);
        src += 8;
        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
      }
    }
  }

  int y = height;
  do {
    srcs[num_taps] = LoadAligned16(src);
    src += 8;
    srcs[num_taps - 1] = _mm_unpacklo_epi64(
        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);

    const __m128i sum =
        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
    if (is_compound) {
      StoreUnaligned16(dst16, sum);
      dst16 += 4 << 1;
    } else {
      const __m128i results = _mm_packus_epi16(sum, sum);
      Store4(dst8, results);
      dst8 += dst_stride;
      Store4(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
    }

    srcs[0] = srcs[2];
    if (num_taps >= 4) {
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      if (num_taps >= 6) {
        srcs[3] = srcs[5];
        srcs[4] = srcs[6];
        if (num_taps == 8) {
          srcs[5] = srcs[7];
          srcs[6] = srcs[8];
        }
      }
    }
    y -= 2;
  } while (y != 0);
}
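
// In the 4xH variant above each LoadAligned16() spans two 4-wide rows of the
// intermediate buffer, so srcs[k] holds rows k and k + 1 and one
// SimpleSum2DVerticalTaps() call yields two output rows; the compound store
// advances dst16 by 4 << 1 = 8 values to match.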

// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
void Filter2DVertical2xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const __m128i* const taps) {
  constexpr int next_row = (num_taps < 6) ? 4 : 8;

  auto* dst8 = static_cast<uint8_t*>(dst);

  __m128i srcs[9];
  srcs[0] = LoadAligned16(src);
  src += 8;
  if (num_taps >= 6) {
    srcs[4] = LoadAligned16(src);
    src += 8;
    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
    if (num_taps == 8) {
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
    }
  }

  int y = height;
  do {
    srcs[next_row] = LoadAligned16(src);
    src += 8;
    if (num_taps == 2) {
      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
    } else if (num_taps == 4) {
      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
    } else if (num_taps == 6) {
      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
    } else if (num_taps == 8) {
      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
    }

    const __m128i sum =
        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
    const __m128i results = _mm_packus_epi16(sum, sum);

    Store2(dst8, results);
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 2));
    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
    // Therefore we don't need to check this condition when |height| > 4.
    if (num_taps <= 4 && height == 2) return;
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 4));
    dst8 += dst_stride;
    Store2(dst8, _mm_srli_si128(results, 6));
    dst8 += dst_stride;

    srcs[0] = srcs[4];
    if (num_taps == 6) {
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
    } else if (num_taps == 8) {
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
    }

    y -= 4;
  } while (y != 0);
}

// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
// Vertical calculations.
__m128i Compound1DShift(const __m128i sum) {
  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
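
// For 8bpp (assuming kInterRoundBitsHorizontal == 3) Compound1DShift() is a
// rounding shift by 2. Using the horizontal constant for 1D vertical output
// keeps both 1D compound paths in the same intermediate precision, so the
// downstream averaging/weighting code does not need to know which direction
// produced the values.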

template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs,
                        const __m128i* const v_tap) {
  __m128i v_src[4];

  if (num_taps == 6) {
    // 6 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
  } else if (num_taps == 8) {
    // 8 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
  } else if (num_taps == 2) {
    // 2 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
  } else {
    // 4 taps.
    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
  }
  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
  return sum;
}
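
// Why the unpacking in SumVerticalTaps() works (a sketch):
// _mm_unpacklo_epi8(srcs[0], srcs[1]) interleaves two rows byte-wise,
//   r0[0] r1[0] r0[1] r1[1] ...
// so each pixel sits next to its neighbor from the following row, and
// _mm_maddubs_epi16 against a duplicated tap pair (k1k0 etc.) computes
// k0 * r0[i] + k1 * r1[i] per pixel in a single instruction. These are
// exactly the per-pair terms that SumOnePassTaps() accumulates.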

template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const __m128i* const v_tap) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  __m128i srcs[9];

  if (num_taps == 2) {
    srcs[2] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;

    int y = height;
    do {
      // 10 11 12 13
      const __m128i a = Load4(src);
      // 00 01 02 03 10 11 12 13
      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
      src += src_stride;
      // 20 21 22 23
      srcs[2] = Load4(src);
      src += src_stride;
      // 10 11 12 13 20 21 22 23
      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 4) {
    srcs[4] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);

    int y = height;
    do {
      // 30 31 32 33
      const __m128i b = Load4(src);
      // 20 21 22 23 30 31 32 33
      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
      src += src_stride;
      // 40 41 42 43
      srcs[4] = Load4(src);
      src += src_stride;
      // 30 31 32 33 40 41 42 43
      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 6) {
    srcs[6] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
    // 30 31 32 33
    const __m128i b = Load4(src);
    // 20 21 22 23 30 31 32 33
    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
    src += src_stride;
    // 40 41 42 43
    srcs[4] = Load4(src);
    src += src_stride;
    // 30 31 32 33 40 41 42 43
    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);

    int y = height;
    do {
      // 50 51 52 53
      const __m128i c = Load4(src);
      // 40 41 42 43 50 51 52 53
      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
      src += src_stride;
      // 60 61 62 63
      srcs[6] = Load4(src);
      src += src_stride;
      // 50 51 52 53 60 61 62 63
      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      y -= 2;
    } while (y != 0);
  } else if (num_taps == 8) {
    srcs[8] = _mm_setzero_si128();
    // 00 01 02 03
    srcs[0] = Load4(src);
    src += src_stride;
    // 10 11 12 13
    const __m128i a = Load4(src);
    // 00 01 02 03 10 11 12 13
    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
    src += src_stride;
    // 20 21 22 23
    srcs[2] = Load4(src);
    src += src_stride;
    // 10 11 12 13 20 21 22 23
    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
    // 30 31 32 33
    const __m128i b = Load4(src);
    // 20 21 22 23 30 31 32 33
    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
    src += src_stride;
    // 40 41 42 43
    srcs[4] = Load4(src);
    src += src_stride;
    // 30 31 32 33 40 41 42 43
    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
    // 50 51 52 53
    const __m128i c = Load4(src);
    // 40 41 42 43 50 51 52 53
    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
    src += src_stride;
    // 60 61 62 63
    srcs[6] = Load4(src);
    src += src_stride;
    // 50 51 52 53 60 61 62 63
    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);

    int y = height;
    do {
      // 70 71 72 73
      const __m128i d = Load4(src);
      // 60 61 62 63 70 71 72 73
      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
      src += src_stride;
      // 80 81 82 83
      srcs[8] = Load4(src);
      src += src_stride;
      // 70 71 72 73 80 81 82 83
      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);

      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      if (is_compound) {
        const __m128i results = Compound1DShift(sums);
        StoreUnaligned16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const __m128i results_16 =
            RightShiftWithRounding_S16(sums, kFilterBits - 1);
        const __m128i results = _mm_packus_epi16(results_16, results_16);
        Store4(dst8, results);
        dst8 += dst_stride;
        Store4(dst8, _mm_srli_si128(results, 4));
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      srcs[5] = srcs[7];
      srcs[6] = srcs[8];
      y -= 2;
    } while (y != 0);
  }
}
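
// Note on the compound stores above: |dst_stride| is ignored on that path
// and |dst16| advances by a flat 4 << 1 = 8 entries, i.e. two 4-wide rows.
// This appears to rely on the compound prediction buffer being contiguous,
// with its stride equal to the block width.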

template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const __m128i* const v_tap) {
  auto* dst8 = static_cast<uint8_t*>(dst);

  __m128i srcs[9];

  if (num_taps == 2) {
    srcs[2] = _mm_setzero_si128();
    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;

    int y = height;
    do {
      // 00 01 10 11
      srcs[0] = Load2<1>(src, srcs[0]);
      src += src_stride;
      // 00 01 10 11 20 21
      srcs[0] = Load2<2>(src, srcs[0]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      // 40 41
      srcs[2] = Load2<0>(src, srcs[2]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41
      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
      // 10 11 20 21 30 31 40 41
      srcs[1] = _mm_srli_si128(srcs_0_2, 2);

      // This uses srcs[0]..srcs[1].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      if (height == 2) return;
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[2];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 4) {
    srcs[4] = _mm_setzero_si128();

    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;

    int y = height;
    do {
      // 00 01 10 11 20 21 30 31
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      // 40 41
      srcs[4] = Load2<0>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
      // 10 11 20 21 30 31 40 41
      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
      // 20 21 30 31 40 41 50 51
      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
      // 30 31 40 41 50 51 60 61
      srcs[3] = _mm_srli_si128(srcs_0_4, 6);

      // This uses srcs[0]..srcs[3].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      if (height == 2) return;
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 6) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = _mm_setzero_si128();

    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21 30 31
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    // 40 41
    srcs[4] = Load2(src);
    src += src_stride;
    // 00 01 10 11 20 21 30 31 40 41
    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
    // 10 11 20 21 30 31 40 41
    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);

    int y = height;
    do {
      // 40 41 50 51
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      // 40 41 50 51 60 61 70 71
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      // 80 81
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
      // 20 21 30 31 40 41 50 51
      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
      // 30 31 40 41 50 51 60 61
      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
      // 40 41 50 51 60 61 70 71 80 81
      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
      // 50 51 60 61 70 71 80 81
      srcs[5] = _mm_srli_si128(srcs_4_8, 2);

      // This uses srcs[0]..srcs[5].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
      y -= 4;
    } while (y != 0);
  } else if (num_taps == 8) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4.
    assert(height > 4);
    srcs[8] = _mm_setzero_si128();
    // 00 01
    srcs[0] = Load2(src);
    src += src_stride;
    // 00 01 10 11
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    // 00 01 10 11 20 21 30 31
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    // 40 41
    srcs[4] = Load2(src);
    src += src_stride;
    // 40 41 50 51
    srcs[4] = Load2<1>(src, srcs[4]);
    src += src_stride;
    // 40 41 50 51 60 61
    srcs[4] = Load2<2>(src, srcs[4]);
    src += src_stride;

    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
    // 10 11 20 21 30 31 40 41
    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
    // 20 21 30 31 40 41 50 51
    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
    // 30 31 40 41 50 51 60 61
    srcs[3] = _mm_srli_si128(srcs_0_4, 6);

    int y = height;
    do {
      // 40 41 50 51 60 61 70 71
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      // 80 81
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      // 80 81 90 91
      srcs[8] = Load2<1>(src, srcs[8]);
      src += src_stride;
      // 80 81 90 91 a0 a1
      srcs[8] = Load2<2>(src, srcs[8]);
      src += src_stride;

      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
      // 50 51 60 61 70 71 80 81
      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
      // 60 61 70 71 80 81 90 91
      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
      // 70 71 80 81 90 91 a0 a1
      srcs[7] = _mm_srli_si128(srcs_4_8, 6);

      // This uses srcs[0]..srcs[7].
      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
      const __m128i results_16 =
          RightShiftWithRounding_S16(sums, kFilterBits - 1);
      const __m128i results = _mm_packus_epi16(results_16, results_16);

      Store2(dst8, results);
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 2));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 4));
      dst8 += dst_stride;
      Store2(dst8, _mm_srli_si128(results, 6));
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
      y -= 4;
    } while (y != 0);
  }
}
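
// The height > 4 asserts in the 6 and 8 tap branches above mirror
// GetNumTapsInFilter(): small blocks (width/height <= 4) have their long
// filters replaced with 4 tap options, so the 2xH loops that emit four rows
// per iteration never see a 2-row block with a 6 or 8 tap filter.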