// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/mask_blend.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_SSE4_1

#include <smmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is 1 for the 422/420 paths; the 444 case falls through to
// the unsubsampled load at the end of the function.
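// With horizontal subsampling, each returned value is the rounded average of
// a 2x1 (422) or 2x2 (420) group of mask values, e.g. for 420:
//   out[x] = (m[0][2x] + m[0][2x + 1] + m[1][2x] + m[1][2x + 1] + 2) >> 2
// Two output rows of width 4 are packed into the eight 16-bit lanes of the
// result.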
template <int subsampling_x, int subsampling_y>
inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
  if (subsampling_x == 1) {
    const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
    const __m128i mask_val_1 =
        _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
    if (subsampling_y == 1) {
      const __m128i next_mask_val_0 =
          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
  }
  const __m128i mask_val_0 = Load4(mask);
  const __m128i mask_val_1 = Load4(mask + mask_stride);
  return _mm_cvtepu8_epi16(
      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
}

// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
// 16-bit is also the lowest packing for hadd, but without subsampling there is
// an unfortunate conversion required.
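// Subsampled values are rounded averages of the 2x1 (422) or 2x2 (420) input
// group, as in GetMask4x2(), but a single output row of width 8 is produced.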
template <int subsampling_x, int subsampling_y>
inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
  if (subsampling_x == 1) {
    const __m128i row_vals = LoadUnaligned16(mask);

    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);

    if (subsampling_y == 1) {
      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
  }
  assert(subsampling_y == 0 && subsampling_x == 0);
  const __m128i mask_val = LoadLo8(mask);
  return _mm_cvtepu8_epi16(mask_val);
}

// This version returns 8-bit packed values to fit in _mm_maddubs_epi16
// because, when is_inter_intra is true, the prediction values are brought to
// 8-bit packing as well.
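// Mask values are at most 64, so the unsigned saturation in the
// _mm_packus_epi16() below never clips them.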
template <int subsampling_x, int subsampling_y>
inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
  if (subsampling_x == 1) {
    const __m128i row_vals = LoadUnaligned16(mask);

    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);

    if (subsampling_y == 1) {
      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
      const __m128i next_mask_val_1 =
          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
      subsampled_mask = _mm_add_epi16(
          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
    }
    const __m128i ret =
        RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
    return _mm_packus_epi16(ret, ret);
  }
  assert(subsampling_y == 0 && subsampling_x == 0);
  // Unfortunately there is no shift operation for 8-bit packing, or else we
  // could return everything with 8-bit packing.
  const __m128i mask_val = LoadLo8(mask);
  return mask_val;
}

inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
                                  const int16_t* const pred_1,
                                  const __m128i pred_mask_0,
                                  const __m128i pred_mask_1, uint8_t* dst,
                                  const ptrdiff_t dst_stride) {
  const __m128i pred_val_0_lo = LoadLo8(pred_0);
  const __m128i pred_val_0 = LoadHi8(pred_val_0_lo, pred_0 + 4);
  const __m128i pred_val_1_lo = LoadLo8(pred_1);
  const __m128i pred_val_1 = LoadHi8(pred_val_1_lo, pred_1 + 4);
  const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
  const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
  const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
  const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);

  // int res = (mask_value * prediction_0[x] +
  //      (64 - mask_value) * prediction_1[x]) >> 6;
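  // Interleaving (mask, 64 - mask) with (pred_0, pred_1) lets each
  // _mm_madd_epi16() lane compute the full numerator in one multiply-add.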
  const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
  const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
  const __m128i compound_pred = _mm_packus_epi32(
      _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));

  // dst[x] = static_cast<Pixel>(
  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
  //           (1 << kBitdepth8) - 1));
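  // The shift by 4 below is inter_post_round_bits for 8bpp, and the final
  // _mm_packus_epi16() performs the Clip3() to [0, 255].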
  const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
  const __m128i res = _mm_packus_epi16(result, result);
  Store4(dst, res);
  Store4(dst + dst_stride, _mm_srli_si128(res, 4));
}

template <int subsampling_x, int subsampling_y>
inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
                                 const uint8_t* mask,
                                 const ptrdiff_t mask_stride, uint8_t* dst,
                                 const ptrdiff_t dst_stride) {
  const __m128i mask_inverter = _mm_set1_epi16(64);
  __m128i pred_mask_0 =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                        dst_stride);
  pred_0 += 4 << 1;
  pred_1 += 4 << 1;
  mask += mask_stride << (1 + subsampling_y);
  dst += dst_stride << 1;

  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                        dst_stride);
}

template <int subsampling_x, int subsampling_y>
inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
                                 const uint8_t* const mask_ptr,
                                 const ptrdiff_t mask_stride, const int height,
                                 uint8_t* dst, const ptrdiff_t dst_stride) {
  const uint8_t* mask = mask_ptr;
  if (height == 4) {
    MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
    return;
  }
  const __m128i mask_inverter = _mm_set1_epi16(64);
  int y = 0;
  do {
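    // Each pass blends four 4x2 blocks, i.e. 8 rows. height == 4 was handled
    // above, so the remaining 4xH block heights are multiples of 8.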
    __m128i pred_mask_0 =
        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);

    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;

    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                          dst_stride);
    pred_0 += 4 << 1;
    pred_1 += 4 << 1;
    mask += mask_stride << (1 + subsampling_y);
    dst += dst_stride << 1;
    y += 8;
  } while (y < height);
}

template <int subsampling_x, int subsampling_y>
inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
                           const ptrdiff_t /*prediction_stride_1*/,
                           const uint8_t* const mask_ptr,
                           const ptrdiff_t mask_stride, const int width,
                           const int height, void* dest,
                           const ptrdiff_t dst_stride) {
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
  const ptrdiff_t pred_stride_0 = width;
  const ptrdiff_t pred_stride_1 = width;
  if (width == 4) {
    MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
    return;
  }
  const uint8_t* mask = mask_ptr;
  const __m128i mask_inverter = _mm_set1_epi16(64);
  int y = 0;
  do {
    int x = 0;
    do {
      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
          mask + (x << subsampling_x), mask_stride);
      // 64 - mask
      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
      const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
      const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);

      const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
      const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
      const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
      const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
      // int res = (mask_value * prediction_0[x] +
      //      (64 - mask_value) * prediction_1[x]) >> 6;
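      // As in WriteMaskBlendLine4x2(), the interleaved operands let
      // _mm_madd_epi16() form the full numerator in each 32-bit lane.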
      const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
      const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);

      const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
                                           _mm_srli_epi32(compound_pred_hi, 6));
      // dst[x] = static_cast<Pixel>(
      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
      //           (1 << kBitdepth8) - 1));
      const __m128i result = RightShiftWithRounding_S16(res, 4);
      StoreLo8(dst + x, _mm_packus_epi16(result, result));

      x += 8;
    } while (x < width);
    dst += dst_stride;
    pred_0 += pred_stride_0;
    pred_1 += pred_stride_1;
    mask += mask_stride << subsampling_y;
  } while (++y < height);
}

inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
                                                uint8_t* const pred_1,
                                                const ptrdiff_t pred_stride_1,
                                                const __m128i pred_mask_0,
                                                const __m128i pred_mask_1) {
  const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);

  __m128i pred_val_0 = Load4(pred_0);
  pred_val_0 = _mm_or_si128(_mm_slli_si128(Load4(pred_0 + 4), 4), pred_val_0);
  // TODO(b/150326556): One load.
  __m128i pred_val_1 = Load4(pred_1);
  pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
                            pred_val_1);
  const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
  // int res = (mask_value * prediction_1[x] +
  //      (64 - mask_value) * prediction_0[x]) >> 6;
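  // _mm_maddubs_epi16() treats |pred| as unsigned and |pred_mask| as signed.
  // The two mask values in each pair sum to 64 and pixels are at most 255,
  // so each lane is bounded by 64 * 255 and cannot overflow.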
  const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
  const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
  const __m128i res = _mm_packus_epi16(result, result);

  Store4(pred_1, res);
  Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
}

template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
                                               uint8_t* pred_1,
                                               const ptrdiff_t pred_stride_1,
                                               const uint8_t* mask,
                                               const ptrdiff_t mask_stride) {
  const __m128i mask_inverter = _mm_set1_epi8(64);
  const __m128i pred_mask_u16_first =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  mask += mask_stride << (1 + subsampling_y);
  const __m128i pred_mask_u16_second =
      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
  mask += mask_stride << (1 + subsampling_y);
  __m128i pred_mask_1 =
      _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
  __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
                                      pred_mask_0, pred_mask_1);
  pred_0 += 4 << 1;
  pred_1 += pred_stride_1 << 1;

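  // The mask block for rows 2-3 was packed into the high 8 bytes above; shift
  // it down into the low lanes for the second 4x2 write.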
  pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
  pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
                                      pred_mask_0, pred_mask_1);
}

template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
                                               uint8_t* pred_1,
                                               const ptrdiff_t pred_stride_1,
                                               const uint8_t* const mask_ptr,
                                               const ptrdiff_t mask_stride,
                                               const int height) {
  const uint8_t* mask = mask_ptr;
  if (height == 4) {
    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    return;
  }
  int y = 0;
  do {
    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    pred_0 += 4 << 2;
    pred_1 += pred_stride_1 << 2;
    mask += mask_stride << (2 + subsampling_y);

    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
        pred_0, pred_1, pred_stride_1, mask, mask_stride);
    pred_0 += 4 << 2;
    pred_1 += pred_stride_1 << 2;
    mask += mask_stride << (2 + subsampling_y);
    y += 8;
  } while (y < height);
}

template <int subsampling_x, int subsampling_y>
void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
                                  uint8_t* prediction_1,
                                  const ptrdiff_t prediction_stride_1,
                                  const uint8_t* const mask_ptr,
                                  const ptrdiff_t mask_stride, const int width,
                                  const int height) {
  if (width == 4) {
    InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
        height);
    return;
  }
  const uint8_t* mask = mask_ptr;
  const __m128i mask_inverter = _mm_set1_epi8(64);
  int y = 0;
  do {
    int x = 0;
    do {
      const __m128i pred_mask_1 =
          GetInterIntraMask8<subsampling_x, subsampling_y>(
              mask + (x << subsampling_x), mask_stride);
      // 64 - mask
      const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
      const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);

      const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
      const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
      const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
      // int res = (mask_value * prediction_1[x] +
      //      (64 - mask_value) * prediction_0[x]) >> 6;
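      // Same operand layout as InterIntraWriteMaskBlendLine8bpp4x2(); see the
      // overflow note there.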
      const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
      const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
      const __m128i res = _mm_packus_epi16(result, result);

      StoreLo8(prediction_1 + x, res);

      x += 8;
    } while (x < width);
    prediction_0 += width;
    prediction_1 += prediction_stride_1;
    mask += mask_stride << subsampling_y;
  } while (++y < height);
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
  dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
  dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
  dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
#endif
  // The is_inter_intra index of mask_blend[][] is replaced by
  // inter_intra_mask_blend_8bpp[] in 8-bit.
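  // Indices 0, 1, and 2 correspond to 444, 422, and 420 subsampling,
  // matching the mask_blend[][] entries above.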
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_SSE4_1

namespace libgav1 {
namespace dsp {

void MaskBlendInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_SSE4_1