1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/obmc.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_SSE4_1
19
#include <xmmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
25
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/x86/common_sse4.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35
36 #include "src/dsp/obmc.inc"
37
OverlapBlendFromLeft2xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)38 inline void OverlapBlendFromLeft2xH_SSE4_1(
39 uint8_t* const prediction, const ptrdiff_t prediction_stride,
40 const int height, const uint8_t* const obmc_prediction,
41 const ptrdiff_t obmc_prediction_stride) {
42 uint8_t* pred = prediction;
43 const uint8_t* obmc_pred = obmc_prediction;
44 const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
45 const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
46 // 64 - mask
47 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
48 const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
49 int y = height;
50 do {
51 const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
52 const __m128i obmc_pred_val =
53 Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
54
55 const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
56 const __m128i result =
57 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
58 const __m128i packed_result = _mm_packus_epi16(result, result);
59 Store2(pred, packed_result);
60 pred += prediction_stride;
61 const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
62 memcpy(pred, &second_row_result, sizeof(second_row_result));
63 pred += prediction_stride;
64 obmc_pred += obmc_prediction_stride << 1;
65 y -= 2;
66 } while (y != 0);
67 }
68
OverlapBlendFromLeft4xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)69 inline void OverlapBlendFromLeft4xH_SSE4_1(
70 uint8_t* const prediction, const ptrdiff_t prediction_stride,
71 const int height, const uint8_t* const obmc_prediction,
72 const ptrdiff_t obmc_prediction_stride) {
73 uint8_t* pred = prediction;
74 const uint8_t* obmc_pred = obmc_prediction;
75 const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
76 const __m128i mask_val = Load4(kObmcMask + 2);
77 // 64 - mask
78 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
79 // Duplicate first half of vector.
80 const __m128i masks =
81 _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
82 int y = height;
83 do {
84 const __m128i pred_val0 = Load4(pred);
85 const __m128i obmc_pred_val0 = Load4(obmc_pred);
86 pred += prediction_stride;
87 obmc_pred += obmc_prediction_stride;
88
89 // Place the second row of each source in the second four bytes.
90 const __m128i pred_val =
91 _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
92 const __m128i obmc_pred_val = _mm_alignr_epi8(
93 Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
94 const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
95 const __m128i result =
96 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
97 const __m128i packed_result = _mm_packus_epi16(result, result);
98 Store4(pred - prediction_stride, packed_result);
99 const int second_row_result = _mm_extract_epi32(packed_result, 1);
100 memcpy(pred, &second_row_result, sizeof(second_row_result));
101 pred += prediction_stride;
102 obmc_pred += obmc_prediction_stride;
103 y -= 2;
104 } while (y != 0);
105 }
106
OverlapBlendFromLeft8xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)107 inline void OverlapBlendFromLeft8xH_SSE4_1(
108 uint8_t* const prediction, const ptrdiff_t prediction_stride,
109 const int height, const uint8_t* const obmc_prediction,
110 const ptrdiff_t obmc_prediction_stride) {
111 uint8_t* pred = prediction;
112 const uint8_t* obmc_pred = obmc_prediction;
113 const __m128i mask_inverter = _mm_set1_epi8(64);
114 const __m128i mask_val = LoadLo8(kObmcMask + 6);
115 // 64 - mask
116 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
117 const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
118 int y = height;
119 do {
120 const __m128i pred_val = LoadLo8(pred);
121 const __m128i obmc_pred_val = LoadLo8(obmc_pred);
122 const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
123 const __m128i result =
124 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
125
126 StoreLo8(pred, _mm_packus_epi16(result, result));
127 pred += prediction_stride;
128 obmc_pred += obmc_prediction_stride;
129 } while (--y != 0);
130 }
131
OverlapBlendFromLeft_SSE4_1(void * const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)132 void OverlapBlendFromLeft_SSE4_1(void* const prediction,
133 const ptrdiff_t prediction_stride,
134 const int width, const int height,
135 const void* const obmc_prediction,
136 const ptrdiff_t obmc_prediction_stride) {
137 auto* pred = static_cast<uint8_t*>(prediction);
138 const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
139
140 if (width == 2) {
141 OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
142 obmc_prediction_stride);
143 return;
144 }
145 if (width == 4) {
146 OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
147 obmc_prediction_stride);
148 return;
149 }
150 if (width == 8) {
151 OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
152 obmc_prediction_stride);
153 return;
154 }
155 const __m128i mask_inverter = _mm_set1_epi8(64);
156 const uint8_t* mask = kObmcMask + width - 2;
157 int x = 0;
158 do {
159 pred = static_cast<uint8_t*>(prediction) + x;
160 obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
161 const __m128i mask_val = LoadUnaligned16(mask + x);
162 // 64 - mask
163 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
164 const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
165 const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
166
167 int y = 0;
168 do {
169 const __m128i pred_val = LoadUnaligned16(pred);
170 const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
171 const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
172 const __m128i result_lo =
173 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
174 const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
175 const __m128i result_hi =
176 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
177 StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
178
179 pred += prediction_stride;
180 obmc_pred += obmc_prediction_stride;
181 } while (++y < height);
182 x += 16;
183 } while (x < width);
184 }
185
OverlapBlendFromTop4xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)186 inline void OverlapBlendFromTop4xH_SSE4_1(
187 uint8_t* const prediction, const ptrdiff_t prediction_stride,
188 const int height, const uint8_t* const obmc_prediction,
189 const ptrdiff_t obmc_prediction_stride) {
190 uint8_t* pred = prediction;
191 const uint8_t* obmc_pred = obmc_prediction;
192 const __m128i mask_inverter = _mm_set1_epi16(64);
193 const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
194 const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
195
196 const uint8_t* mask = kObmcMask + height - 2;
197 const int compute_height = height - (height >> 2);
198 int y = 0;
199 do {
200 // First mask in the first half, second mask in the second half.
201 const __m128i mask_val = _mm_shuffle_epi8(
202 _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
203 mask_shuffler);
204 const __m128i masks =
205 _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
206 const __m128i pred_val0 = Load4(pred);
207
208 const __m128i obmc_pred_val0 = Load4(obmc_pred);
209 pred += prediction_stride;
210 obmc_pred += obmc_prediction_stride;
211 const __m128i pred_val =
212 _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
213 const __m128i obmc_pred_val = _mm_alignr_epi8(
214 Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
215 const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
216 const __m128i result =
217 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
218
219 const __m128i packed_result = _mm_packus_epi16(result, result);
220 Store4(pred - prediction_stride, packed_result);
221 Store4(pred, _mm_srli_si128(packed_result, 4));
222 pred += prediction_stride;
223 obmc_pred += obmc_prediction_stride;
224 y += 2;
225 } while (y < compute_height);
226 }
227
OverlapBlendFromTop8xH_SSE4_1(uint8_t * const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)228 inline void OverlapBlendFromTop8xH_SSE4_1(
229 uint8_t* const prediction, const ptrdiff_t prediction_stride,
230 const int height, const uint8_t* const obmc_prediction,
231 const ptrdiff_t obmc_prediction_stride) {
232 uint8_t* pred = prediction;
233 const uint8_t* obmc_pred = obmc_prediction;
234 const uint8_t* mask = kObmcMask + height - 2;
235 const __m128i mask_inverter = _mm_set1_epi8(64);
236 const int compute_height = height - (height >> 2);
237 int y = compute_height;
238 do {
239 const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
240 // 64 - mask
241 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
242 const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
243 const __m128i pred_val = LoadLo8(pred);
244 const __m128i obmc_pred_val = LoadLo8(obmc_pred);
245 const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
246 const __m128i result =
247 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
248
249 StoreLo8(pred, _mm_packus_epi16(result, result));
250 pred += prediction_stride;
251 obmc_pred += obmc_prediction_stride;
252 } while (--y != 0);
253 }
254
OverlapBlendFromTop_SSE4_1(void * const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * const obmc_prediction,const ptrdiff_t obmc_prediction_stride)255 void OverlapBlendFromTop_SSE4_1(void* const prediction,
256 const ptrdiff_t prediction_stride,
257 const int width, const int height,
258 const void* const obmc_prediction,
259 const ptrdiff_t obmc_prediction_stride) {
260 auto* pred = static_cast<uint8_t*>(prediction);
261 const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
262
263 if (width <= 4) {
264 OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
265 obmc_prediction_stride);
266 return;
267 }
268 if (width == 8) {
269 OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
270 obmc_prediction_stride);
271 return;
272 }
273
274 // Stop when mask value becomes 64.
275 const int compute_height = height - (height >> 2);
276 const __m128i mask_inverter = _mm_set1_epi8(64);
277 int y = 0;
278 const uint8_t* mask = kObmcMask + height - 2;
279 do {
280 const __m128i mask_val = _mm_set1_epi8(mask[y]);
281 // 64 - mask
282 const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
283 const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
284 int x = 0;
285 do {
286 const __m128i pred_val = LoadUnaligned16(pred + x);
287 const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
288 const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
289 const __m128i result_lo =
290 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
291 const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
292 const __m128i result_hi =
293 RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
294 StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
295 x += 16;
296 } while (x < width);
297 pred += prediction_stride;
298 obmc_pred += obmc_prediction_stride;
299 } while (++y < compute_height);
300 }
301
Init8bpp()302 void Init8bpp() {
303 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
304 assert(dsp != nullptr);
305 #if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
306 dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
307 #endif
308 #if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
309 dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
310 #endif
311 }
312
313 } // namespace
314
ObmcInit_SSE4_1()315 void ObmcInit_SSE4_1() { Init8bpp(); }
316
317 } // namespace dsp
318 } // namespace libgav1
319
320 #else // !LIBGAV1_ENABLE_SSE4_1
321
322 namespace libgav1 {
323 namespace dsp {
324
// SSE4.1 support is compiled out in this configuration, so there is nothing
// to register.
void ObmcInit_SSE4_1() {
}
326
327 } // namespace dsp
328 } // namespace libgav1
329 #endif // LIBGAV1_ENABLE_SSE4_1
330