• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/obmc.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_SSE4_1
19 
20 #include <xmmintrin.h>
21 
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
25 
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/x86/common_sse4.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31 
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35 
36 #include "src/dsp/obmc.inc"
37 
OverlapBlendFromLeft2xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)38 inline void OverlapBlendFromLeft2xH_SSE4_1(
39     uint8_t* const prediction, const ptrdiff_t prediction_stride,
40     const int height, const uint8_t* const obmc_prediction,
41     const ptrdiff_t obmc_prediction_stride) {
42   uint8_t* pred = prediction;
43   const uint8_t* obmc_pred = obmc_prediction;
44   const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
45   const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
46   // 64 - mask
47   const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
48   const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
49   int y = height;
50   do {
51     const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
52     const __m128i obmc_pred_val =
53         Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
54 
55     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
56     const __m128i result =
57         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
58     const __m128i packed_result = _mm_packus_epi16(result, result);
59     Store2(pred, packed_result);
60     pred += prediction_stride;
61     const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
62     memcpy(pred, &second_row_result, sizeof(second_row_result));
63     pred += prediction_stride;
64     obmc_pred += obmc_prediction_stride << 1;
65     y -= 2;
66   } while (y != 0);
67 }
68 
OverlapBlendFromLeft4xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)69 inline void OverlapBlendFromLeft4xH_SSE4_1(
70     uint8_t* const prediction, const ptrdiff_t prediction_stride,
71     const int height, const uint8_t* const obmc_prediction,
72     const ptrdiff_t obmc_prediction_stride) {
73   uint8_t* pred = prediction;
74   const uint8_t* obmc_pred = obmc_prediction;
75   const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
76   const __m128i mask_val = Load4(kObmcMask + 2);
77   // 64 - mask
78   const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
79   // Duplicate first half of vector.
80   const __m128i masks =
81       _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
82   int y = height;
83   do {
84     const __m128i pred_val0 = Load4(pred);
85     const __m128i obmc_pred_val0 = Load4(obmc_pred);
86     pred += prediction_stride;
87     obmc_pred += obmc_prediction_stride;
88 
89     // Place the second row of each source in the second four bytes.
90     const __m128i pred_val =
91         _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
92     const __m128i obmc_pred_val = _mm_alignr_epi8(
93         Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
94     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
95     const __m128i result =
96         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
97     const __m128i packed_result = _mm_packus_epi16(result, result);
98     Store4(pred - prediction_stride, packed_result);
99     const int second_row_result = _mm_extract_epi32(packed_result, 1);
100     memcpy(pred, &second_row_result, sizeof(second_row_result));
101     pred += prediction_stride;
102     obmc_pred += obmc_prediction_stride;
103     y -= 2;
104   } while (y != 0);
105 }
106 
OverlapBlendFromLeft8xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)107 inline void OverlapBlendFromLeft8xH_SSE4_1(
108     uint8_t* const prediction, const ptrdiff_t prediction_stride,
109     const int height, const uint8_t* const obmc_prediction,
110     const ptrdiff_t obmc_prediction_stride) {
111   uint8_t* pred = prediction;
112   const uint8_t* obmc_pred = obmc_prediction;
113   const __m128i mask_inverter = _mm_set1_epi8(64);
114   const __m128i mask_val = LoadLo8(kObmcMask + 6);
115   // 64 - mask
116   const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
117   const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
118   int y = height;
119   do {
120     const __m128i pred_val = LoadLo8(pred);
121     const __m128i obmc_pred_val = LoadLo8(obmc_pred);
122     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
123     const __m128i result =
124         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
125 
126     StoreLo8(pred, _mm_packus_epi16(result, result));
127     pred += prediction_stride;
128     obmc_pred += obmc_prediction_stride;
129   } while (--y != 0);
130 }
131 
OverlapBlendFromLeft_SSE4_1(void * const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)132 void OverlapBlendFromLeft_SSE4_1(void* const prediction,
133                                  const ptrdiff_t prediction_stride,
134                                  const int width, const int height,
135                                  const void* const obmc_prediction,
136                                  const ptrdiff_t obmc_prediction_stride) {
137   auto* pred = static_cast<uint8_t*>(prediction);
138   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
139 
140   if (width == 2) {
141     OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
142                                    obmc_prediction_stride);
143     return;
144   }
145   if (width == 4) {
146     OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
147                                    obmc_prediction_stride);
148     return;
149   }
150   if (width == 8) {
151     OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
152                                    obmc_prediction_stride);
153     return;
154   }
155   const __m128i mask_inverter = _mm_set1_epi8(64);
156   const uint8_t* mask = kObmcMask + width - 2;
157   int x = 0;
158   do {
159     pred = static_cast<uint8_t*>(prediction) + x;
160     obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
161     const __m128i mask_val = LoadUnaligned16(mask + x);
162     // 64 - mask
163     const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
164     const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
165     const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
166 
167     int y = 0;
168     do {
169       const __m128i pred_val = LoadUnaligned16(pred);
170       const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
171       const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
172       const __m128i result_lo =
173           RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
174       const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
175       const __m128i result_hi =
176           RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
177       StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
178 
179       pred += prediction_stride;
180       obmc_pred += obmc_prediction_stride;
181     } while (++y < height);
182     x += 16;
183   } while (x < width);
184 }
185 
OverlapBlendFromTop4xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)186 inline void OverlapBlendFromTop4xH_SSE4_1(
187     uint8_t* const prediction, const ptrdiff_t prediction_stride,
188     const int height, const uint8_t* const obmc_prediction,
189     const ptrdiff_t obmc_prediction_stride) {
190   uint8_t* pred = prediction;
191   const uint8_t* obmc_pred = obmc_prediction;
192   const __m128i mask_inverter = _mm_set1_epi16(64);
193   const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
194   const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
195 
196   const uint8_t* mask = kObmcMask + height - 2;
197   const int compute_height = height - (height >> 2);
198   int y = 0;
199   do {
200     // First mask in the first half, second mask in the second half.
201     const __m128i mask_val = _mm_shuffle_epi8(
202         _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
203         mask_shuffler);
204     const __m128i masks =
205         _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
206     const __m128i pred_val0 = Load4(pred);
207 
208     const __m128i obmc_pred_val0 = Load4(obmc_pred);
209     pred += prediction_stride;
210     obmc_pred += obmc_prediction_stride;
211     const __m128i pred_val =
212         _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
213     const __m128i obmc_pred_val = _mm_alignr_epi8(
214         Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
215     const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
216     const __m128i result =
217         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
218 
219     const __m128i packed_result = _mm_packus_epi16(result, result);
220     Store4(pred - prediction_stride, packed_result);
221     Store4(pred, _mm_srli_si128(packed_result, 4));
222     pred += prediction_stride;
223     obmc_pred += obmc_prediction_stride;
224     y += 2;
225   } while (y < compute_height);
226 }
227 
OverlapBlendFromTop8xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)228 inline void OverlapBlendFromTop8xH_SSE4_1(
229     uint8_t* const prediction, const ptrdiff_t prediction_stride,
230     const int height, const uint8_t* const obmc_prediction,
231     const ptrdiff_t obmc_prediction_stride) {
232   uint8_t* pred = prediction;
233   const uint8_t* obmc_pred = obmc_prediction;
234   const uint8_t* mask = kObmcMask + height - 2;
235   const __m128i mask_inverter = _mm_set1_epi8(64);
236   const int compute_height = height - (height >> 2);
237   int y = compute_height;
238   do {
239     const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
240     // 64 - mask
241     const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
242     const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
243     const __m128i pred_val = LoadLo8(pred);
244     const __m128i obmc_pred_val = LoadLo8(obmc_pred);
245     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
246     const __m128i result =
247         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
248 
249     StoreLo8(pred, _mm_packus_epi16(result, result));
250     pred += prediction_stride;
251     obmc_pred += obmc_prediction_stride;
252   } while (--y != 0);
253 }
254 
OverlapBlendFromTop_SSE4_1(void * const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)255 void OverlapBlendFromTop_SSE4_1(void* const prediction,
256                                 const ptrdiff_t prediction_stride,
257                                 const int width, const int height,
258                                 const void* const obmc_prediction,
259                                 const ptrdiff_t obmc_prediction_stride) {
260   auto* pred = static_cast<uint8_t*>(prediction);
261   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
262 
263   if (width <= 4) {
264     OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
265                                   obmc_prediction_stride);
266     return;
267   }
268   if (width == 8) {
269     OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
270                                   obmc_prediction_stride);
271     return;
272   }
273 
274   // Stop when mask value becomes 64.
275   const int compute_height = height - (height >> 2);
276   const __m128i mask_inverter = _mm_set1_epi8(64);
277   int y = 0;
278   const uint8_t* mask = kObmcMask + height - 2;
279   do {
280     const __m128i mask_val = _mm_set1_epi8(mask[y]);
281     // 64 - mask
282     const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
283     const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
284     int x = 0;
285     do {
286       const __m128i pred_val = LoadUnaligned16(pred + x);
287       const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
288       const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
289       const __m128i result_lo =
290           RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
291       const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
292       const __m128i result_hi =
293           RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
294       StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
295       x += 16;
296     } while (x < width);
297     pred += prediction_stride;
298     obmc_pred += obmc_prediction_stride;
299   } while (++y < compute_height);
300 }
301 
Init8bpp()302 void Init8bpp() {
303   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
304   assert(dsp != nullptr);
305 #if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
306   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
307 #endif
308 #if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
309   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
310 #endif
311 }
312 
313 }  // namespace
314 
ObmcInit_SSE4_1()315 void ObmcInit_SSE4_1() { Init8bpp(); }
316 
317 }  // namespace dsp
318 }  // namespace libgav1
319 
320 #else  // !LIBGAV1_ENABLE_SSE4_1
321 
322 namespace libgav1 {
323 namespace dsp {
324 
// LIBGAV1_ENABLE_SSE4_1 is off: keep the symbol available but register
// nothing.
void ObmcInit_SSE4_1() {
  // Intentionally empty.
}
326 
327 }  // namespace dsp
328 }  // namespace libgav1
329 #endif  // LIBGAV1_ENABLE_SSE4_1
330