// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/mask_blend.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_SSE4_1

#include <smmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is always 1 when this function is called.
template <int subsampling_x, int subsampling_y>
inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
  if (subsampling_x == 1) {
    const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
    const __m128i mask_val_1 =
        _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
    if (subsampling_y == 1) {
      const __m128i next_mask_val_0 =
          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
  }
  const __m128i mask_val_0 = Load4(mask);
  const __m128i mask_val_1 = Load4(mask + mask_stride);
  return _mm_cvtepu8_epi16(
      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
}
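// A scalar sketch of the subsampled path above, for one output position x in
// the first output row when subsampling_y == 1 (four mask samples averaged
// with rounding):
//   m = (mask[2 * x] + mask[2 * x + 1] + mask[mask_stride + 2 * x] +
//        mask[mask_stride + 2 * x + 1] + 2) >> 2;
// With subsampling_y == 0, only the two horizontally adjacent samples are
// averaged, with a rounding shift of 1.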

// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
// 16-bit is also the lowest packing at which _mm_hadd_epi16 operates, but
// without subsampling an unfortunate widening conversion is still required.
template <int subsampling_x, int subsampling_y>
inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
  if (subsampling_x == 1) {
    const __m128i row_vals = LoadUnaligned16(mask);

    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);

    if (subsampling_y == 1) {
      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
  }
  assert(subsampling_y == 0 && subsampling_x == 0);
  const __m128i mask_val = LoadLo8(mask);
  return _mm_cvtepu8_epi16(mask_val);
}
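// The 16-bit packing pairs with _mm_unpacklo_epi16/_mm_unpackhi_epi16 and
// _mm_madd_epi16 in the blend loops below: interleaving mask and (64 - mask)
// lanes against interleaved prediction_0 and prediction_1 lanes lets a single
// multiply-add compute mask * pred_0 + (64 - mask) * pred_1 in 32 bits.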

// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
// when is_inter_intra is true, the prediction values are brought to 8-bit
// packing as well.
template <int subsampling_x, int subsampling_y>
inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
  if (subsampling_x == 1) {
    const __m128i row_vals = LoadUnaligned16(mask);

    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);

    if (subsampling_y == 1) {
      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    const __m128i ret =
        RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
    return _mm_packus_epi16(ret, ret);
  }
  assert(subsampling_y == 0 && subsampling_x == 0);
  // Unfortunately there is no shift operation for 8-bit packing, or else we
  // could return everything with 8-bit packing.
  const __m128i mask_val = LoadLo8(mask);
  return mask_val;
}
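// With both the mask and the predictions in 8-bit packing, the inter-intra
// blend below can use a single _mm_maddubs_epi16 per eight pixels instead of
// first widening everything to 16 bits.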

inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
                                  const int16_t* const pred_1,
                                  const __m128i pred_mask_0,
                                  const __m128i pred_mask_1, uint8_t* dst,
                                  const ptrdiff_t dst_stride) {
  const __m128i pred_val_0_lo = LoadLo8(pred_0);
  const __m128i pred_val_0 = LoadHi8(pred_val_0_lo, pred_0 + 4);
  const __m128i pred_val_1_lo = LoadLo8(pred_1);
  const __m128i pred_val_1 = LoadHi8(pred_val_1_lo, pred_1 + 4);
  const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
  const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
  const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
  const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);

  // int res = (mask_value * prediction_0[x] +
  //      (64 - mask_value) * prediction_1[x]) >> 6;
  const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
  const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
  const __m128i compound_pred = _mm_packus_epi32(
      _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));

  // dst[x] = static_cast<Pixel>(
  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
  //           (1 << kBitdepth8) - 1));
  const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
  const __m128i res = _mm_packus_epi16(result, result);
  Store4(dst, res);
  Store4(dst + dst_stride, _mm_srli_si128(res, 4));
}
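// The Clip3() in the reference snippet above has no explicit counterpart here:
// _mm_packus_epi32 and _mm_packus_epi16 saturate to [0, 65535] and [0, 255]
// respectively, so the final pack performs the clamp to (1 << kBitdepth8) - 1.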

template <int subsampling_x, int subsampling_y>
inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
                                 const uint8_t* mask,
                                 const ptrdiff_t mask_stride, uint8_t* dst,
                                 const ptrdiff_t dst_stride) {
  const __m128i mask_inverter = _mm_set1_epi16(64);
  __m128i pred_mask_0 =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                        dst_stride);
  pred_0 += 4 << 1;
  pred_1 += 4 << 1;
  mask += mask_stride << (1 + subsampling_y);
  dst += dst_stride << 1;

  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                        dst_stride);
}

template <int subsampling_x, int subsampling_y>
inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
                                 const uint8_t* const mask_ptr,
                                 const ptrdiff_t mask_stride, const int height,
                                 uint8_t* dst, const ptrdiff_t dst_stride) {
  const uint8_t* mask = mask_ptr;
  if (height == 4) {
    MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
    return;
  }
  const __m128i mask_inverter = _mm_set1_epi16(64);
  int y = 0;
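  // The loop body is unrolled to cover eight rows (four 4x2 lines) per
  // iteration; 4xH blocks taller than 4 have heights that are multiples of 8.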
  do {
    __m128i pred_mask_0 =
        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);

    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;
    y += 8;
  } while (y < height);
}

template <int subsampling_x, int subsampling_y>
inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
                           const ptrdiff_t /*prediction_stride_1*/,
                           const uint8_t* const mask_ptr,
                           const ptrdiff_t mask_stride, const int width,
                           const int height, void* dest,
                           const ptrdiff_t dst_stride) {
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
  const ptrdiff_t pred_stride_0 = width;
  const ptrdiff_t pred_stride_1 = width;
  if (width == 4) {
    MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
    return;
  }
  const uint8_t* mask = mask_ptr;
  const __m128i mask_inverter = _mm_set1_epi16(64);
  int y = 0;
  do {
    int x = 0;
    do {
      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
          mask + (x << subsampling_x), mask_stride);
      // 64 - mask
      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
      const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
      const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);

      const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
      const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
      const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
      const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
      // int res = (mask_value * prediction_0[x] +
      //      (64 - mask_value) * prediction_1[x]) >> 6;
      const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
      const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);

      const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
                                           _mm_srli_epi32(compound_pred_hi, 6));
      // dst[x] = static_cast<Pixel>(
      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
      //           (1 << kBitdepth8) - 1));
      const __m128i result = RightShiftWithRounding_S16(res, 4);
      StoreLo8(dst + x, _mm_packus_epi16(result, result));

      x += 8;
    } while (x < width);
    dst += dst_stride;
    pred_0 += pred_stride_0;
    pred_1 += pred_stride_1;
    mask += mask_stride << subsampling_y;
  } while (++y < height);
}

inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
                                                uint8_t* const pred_1,
                                                const ptrdiff_t pred_stride_1,
                                                const __m128i pred_mask_0,
                                                const __m128i pred_mask_1) {
  const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);

  __m128i pred_val_0 = Load4(pred_0);
  pred_val_0 = _mm_or_si128(_mm_slli_si128(Load4(pred_0 + 4), 4), pred_val_0);
  // TODO(b/150326556): One load.
  __m128i pred_val_1 = Load4(pred_1);
  pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
                            pred_val_1);
  const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
  // int res = (mask_value * prediction_1[x] +
  //      (64 - mask_value) * prediction_0[x]) >> 6;
  const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
  const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
  const __m128i res = _mm_packus_epi16(result, result);

  Store4(pred_1, res);
  Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
}
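// _mm_maddubs_epi16 treats its first operand (the interleaved pixels) as
// unsigned and its second (the interleaved mask values) as signed. Because the
// two mask values in each pair sum to 64, each product pair is at most
// 255 * 64 = 16320 and never saturates the signed 16-bit intermediate.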

template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
                                               uint8_t* pred_1,
                                               const ptrdiff_t pred_stride_1,
                                               const uint8_t* mask,
                                               const ptrdiff_t mask_stride) {
  const __m128i mask_inverter = _mm_set1_epi8(64);
  const __m128i pred_mask_u16_first =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  mask += mask_stride << (1 + subsampling_y);
  const __m128i pred_mask_u16_second =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  mask += mask_stride << (1 + subsampling_y);
  __m128i pred_mask_1 =
      _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
  __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
                                      pred_mask_0, pred_mask_1);
  pred_0 += 4 << 1;
  pred_1 += pred_stride_1 << 1;

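  // The second 4x2 block reuses the upper eight bytes of the packed mask.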
  pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
  pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
                                      pred_mask_0, pred_mask_1);
}

template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
                                               uint8_t* pred_1,
                                               const ptrdiff_t pred_stride_1,
                                               const uint8_t* const mask_ptr,
                                               const ptrdiff_t mask_stride,
                                               const int height) {
  const uint8_t* mask = mask_ptr;
  if (height == 4) {
    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    return;
  }
  int y = 0;
  do {
    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    pred_0 += 4 << 2;
    pred_1 += pred_stride_1 << 2;
    mask += mask_stride << (2 + subsampling_y);

    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    pred_0 += 4 << 2;
    pred_1 += pred_stride_1 << 2;
    mask += mask_stride << (2 + subsampling_y);
    y += 8;
  } while (y < height);
}

template <int subsampling_x, int subsampling_y>
void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
                                  uint8_t* prediction_1,
                                  const ptrdiff_t prediction_stride_1,
                                  const uint8_t* const mask_ptr,
                                  const ptrdiff_t mask_stride, const int width,
                                  const int height) {
  if (width == 4) {
    InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
        height);
    return;
  }
  const uint8_t* mask = mask_ptr;
  const __m128i mask_inverter = _mm_set1_epi8(64);
  int y = 0;
  do {
    int x = 0;
    do {
      const __m128i pred_mask_1 =
          GetInterIntraMask8<subsampling_x, subsampling_y>(
              mask + (x << subsampling_x), mask_stride);
      // 64 - mask
      const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
      const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);

      const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
      const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
      const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
      // int res = (mask_value * prediction_1[x] +
      //      (64 - mask_value) * prediction_0[x]) >> 6;
      const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
      const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
      const __m128i res = _mm_packus_epi16(result, result);

      StoreLo8(prediction_1 + x, res);

      x += 8;
    } while (x < width);
    prediction_0 += width;
    prediction_1 += prediction_stride_1;
    mask += mask_stride << subsampling_y;
  } while (++y < height);
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
  dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
  dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
  dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
#endif
  // The is_inter_intra index of mask_blend[][] is replaced by
  // inter_intra_mask_blend_8bpp[] in 8-bit.
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
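
// A minimal dispatch sketch (a sketch only, assuming the GetDspTable()
// accessor from src/dsp/dsp.h):
//   MaskBlendInit_SSE4_1();
//   const Dsp* const dsp = GetDspTable(kBitdepth8);
//   // mask_blend[2][0] is the 4:2:0, non-inter-intra entry set above.
//   dsp->mask_blend[2][0](pred_0, pred_1, /*prediction_stride_1=*/width,
//                         mask, mask_stride, width, height, dest, dest_stride);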

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_SSE4_1

namespace libgav1 {
namespace dsp {

void MaskBlendInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_SSE4_1