// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/inverse_transform.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_SSE4_1

#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Include the constants and utility functions inside the anonymous namespace.
#include "src/dsp/inverse_transform.inc"

template <int store_width, int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
                                    const __m128i* s) {
  // NOTE: It is expected that the compiler will unroll these loops.
  if (store_width == 16) {
    for (int i = 0; i < store_count; i += 4) {
      StoreUnaligned16(&dst[i * stride + idx], s[i]);
      StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
      StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
      StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
    }
  }
  if (store_width == 8) {
    for (int i = 0; i < store_count; i += 4) {
      StoreLo8(&dst[i * stride + idx], s[i]);
      StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
      StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
      StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
    }
  }
}

template <int load_width, int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
                                   int32_t idx, __m128i* x) {
  // NOTE: It is expected that the compiler will unroll these loops.
  if (load_width == 16) {
    for (int i = 0; i < load_count; i += 4) {
      x[i] = LoadUnaligned16(&src[i * stride + idx]);
      x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
      x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
      x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
    }
  }
  if (load_width == 8) {
    for (int i = 0; i < load_count; i += 4) {
      x[i] = LoadLo8(&src[i * stride + idx]);
      x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
      x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
      x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
    }
  }
}

// Butterfly rotate 4 values.
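// Computes the rotation
//   x = a * cos(angle * pi / 128) - b * sin(angle * pi / 128)
//   y = a * sin(angle * pi / 128) + b * cos(angle * pi / 128)
// in 12-bit fixed point: Cos128()/Sin128() (from inverse_transform.inc)
// return the trig values scaled by 1 << 12, each _mm_madd_epi16() result is
// shifted right by 12 with rounding, and |flip| swaps the two outputs.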
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
                                               const int angle,
                                               const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i psin_pcos = _mm_set1_epi32(
      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
  const __m128i sign =
      _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
  // -sin cos, -sin cos, -sin cos, -sin cos
  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
  const __m128i x = _mm_packs_epi32(x1, x1);
  const __m128i y = _mm_packs_epi32(y1, y1);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotate 8 values.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
                                               const int angle,
                                               const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i psin_pcos = _mm_set1_epi32(
      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
  const __m128i sign =
      _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
  // -sin cos, -sin cos, -sin cos, -sin cos
  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
  const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
  const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
  const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
  const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
  const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
  const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
  const __m128i x = _mm_packs_epi32(x1, x1_hi);
  const __m128i y = _mm_packs_epi32(y1, y1_hi);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

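// Fast path for a rotation whose first input is known to be zero: the
// outputs reduce to x = -b * sin128 and y = b * cos128. _mm_mulhrs_epi16()
// computes (v * c + (1 << 14)) >> 15, so pre-shifting the 12-bit constants
// left by 3 yields the same rounded shift by 12 as the general path.
// ButterflyRotation_SecondIsZero() below is the analogous fast path for a
// zero second input.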
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
                                                         const int angle,
                                                         const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
  const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
  const __m128i x = _mm_mulhrs_epi16(*b, psin);
  const __m128i y = _mm_mulhrs_epi16(*b, pcos);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
                                                          __m128i* b,
                                                          const int angle,
                                                          const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
  const __m128i psin = _mm_set1_epi16(sin128 << 3);
  const __m128i x = _mm_mulhrs_epi16(*a, pcos);
  const __m128i y = _mm_mulhrs_epi16(*a, psin);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

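// Saturating butterfly: (a, b) <- (a + b, a - b), or (b + a, b - a) when
// |flip| is set.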
LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
  __m128i x, y;
  if (flip) {
    y = _mm_adds_epi16(*b, *a);
    x = _mm_subs_epi16(*b, *a);
  } else {
    x = _mm_adds_epi16(*a, *b);
    y = _mm_subs_epi16(*a, *b);
  }
  *a = x;
  *b = y;
}

using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
                                       bool flip);

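// Rounds and shifts |residual| right by |v_row_shift|. Lanes that would
// overflow int16_t once the rounding bias in |v_row_shift_add| is added are
// shifted as unsigned values instead.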
LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
                                            const __m128i v_row_shift_add,
                                            const __m128i v_row_shift) {
  const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
  // The max row_shift is 2, so int16_t values greater than 0x7ffd may
  // overflow. Generate a mask for this case.
  const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
  const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
  // Assume int16_t values.
  const __m128i a = _mm_sra_epi16(x, v_row_shift);
  // Assume uint16_t values.
  const __m128i b = _mm_srl_epi16(x, v_row_shift);
  // Select the correct shifted value.
  return _mm_blendv_epi8(a, b, mask);
}

//------------------------------------------------------------------------------
// Discrete Cosine Transforms (DCT).

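// DC-only fast path for the DCT row transform: when the only non-zero
// coefficient is the DC term, one 1-D DCT pass reduces to scaling src[0] by
// cos(pi / 4) (Cos128(32) in 12-bit fixed point) and broadcasting the result
// across the row. The optional kTransformRowMultiplier rounding and the row
// shift are applied in 32 bits to avoid int16_t overflow. Returns false when
// more than one coefficient is non-zero and the full transform must run.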
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
                                     int non_zero_coeff_count,
                                     bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
  const __m128i v_src =
      (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const int16_t cos128 = Cos128(32);
  const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));

  // Expand to 32 bits to prevent int16_t overflows during the shift add.
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_cvtepi16_epi32(xy);
  const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
  const __m128i b = _mm_add_epi32(a, v_row_shift_add);
  const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
  const __m128i c = _mm_sra_epi32(b, v_row_shift);
  const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
  const __m128i xy_shifted = _mm_packs_epi32(c, c1);

  if (width == 4) {
    StoreLo8(dst, xy_shifted);
  } else {
    for (int i = 0; i < width; i += 8) {
      StoreUnaligned16(dst, xy_shifted);
      dst += 8;
    }
  }
  return true;
}

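// Column counterpart of DctDcOnly(): scales the first row by cos(pi / 4),
// then replicates it down the block. The memcpy reads row y - 1 from |src|;
// this assumes |dest| and |source| alias the same buffer, so each copy reads
// the row written by the previous iteration.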
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           int width) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  const int16_t cos128 = Cos128(32);

  // Calculate dc values for first row.
  if (width == 4) {
    const __m128i v_src = LoadLo8(src);
    const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
    StoreLo8(dst, xy);
  } else {
    int i = 0;
    do {
      const __m128i v_src = LoadUnaligned16(&src[i]);
      const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
      StoreUnaligned16(&dst[i], xy);
      i += 8;
    } while (i < width);
  }

  // Copy first row to the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
  }
  return true;
}

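// The stage numbers in the comments below follow one numbering shared by all
// DCT sizes in this file, so they are not consecutive within any single
// helper: Dct4Stages, Dct8Stages, Dct16Stages and Dct32Stages are chained
// (each operating on its own range of s[]) to build the larger transforms.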
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  HadamardRotation(&s[0], &s[3], false);
  HadamardRotation(&s[1], &s[2], false);
}

// Process 4 dct4 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, const void* source,
                                       int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[4], x[4];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(src, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(src, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(src, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }
  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(s, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(s, s);
    }
    StoreDst<8, 4>(dst, step, 0, s);
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false);
  HadamardRotation(&s[6], &s[7], true);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  HadamardRotation(&s[0], &s[7], false);
  HadamardRotation(&s[1], &s[6], false);
  HadamardRotation(&s[2], &s[5], false);
  HadamardRotation(&s[3], &s[4], false);
}

// Process dct8 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, const void* source,
                                       int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[8];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(src, step, 0, x);
    }
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<butterfly_rotation>(s);
  Dct8Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(s, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, s);
    }
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false);
  HadamardRotation(&s[10], &s[11], true);
  HadamardRotation(&s[12], &s[13], false);
  HadamardRotation(&s[14], &s[15], true);

  // stage 14.
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false);
  HadamardRotation(&s[9], &s[10], false);
  HadamardRotation(&s[12], &s[15], true);
  HadamardRotation(&s[13], &s[14], true);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  HadamardRotation(&s[0], &s[15], false);
  HadamardRotation(&s[1], &s[14], false);
  HadamardRotation(&s[2], &s[13], false);
  HadamardRotation(&s[3], &s[12], false);
  HadamardRotation(&s[4], &s[11], false);
  HadamardRotation(&s[5], &s[10], false);
  HadamardRotation(&s[6], &s[9], false);
  HadamardRotation(&s[7], &s[8], false);
}

// Process dct16 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[16], x[16];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(src, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(src, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(src, step, 0, x);
    }
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<butterfly_rotation>(s);
  Dct8Stages<butterfly_rotation>(s);
  Dct16Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&s[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&s[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, s);
    }
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }
  // stage 6.
  HadamardRotation(&s[16], &s[17], false);
  HadamardRotation(&s[18], &s[19], true);
  HadamardRotation(&s[20], &s[21], false);
  HadamardRotation(&s[22], &s[23], true);
  HadamardRotation(&s[24], &s[25], false);
  HadamardRotation(&s[26], &s[27], true);
  HadamardRotation(&s[28], &s[29], false);
  HadamardRotation(&s[30], &s[31], true);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false);
  HadamardRotation(&s[17], &s[18], false);
  HadamardRotation(&s[20], &s[23], true);
  HadamardRotation(&s[21], &s[22], true);
  HadamardRotation(&s[24], &s[27], false);
  HadamardRotation(&s[25], &s[26], false);
  HadamardRotation(&s[28], &s[31], true);
  HadamardRotation(&s[29], &s[30], true);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false);
  HadamardRotation(&s[17], &s[22], false);
  HadamardRotation(&s[18], &s[21], false);
  HadamardRotation(&s[19], &s[20], false);
  HadamardRotation(&s[24], &s[31], true);
  HadamardRotation(&s[25], &s[30], true);
  HadamardRotation(&s[26], &s[29], true);
  HadamardRotation(&s[27], &s[28], true);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  HadamardRotation(&s[0], &s[31], false);
  HadamardRotation(&s[1], &s[30], false);
  HadamardRotation(&s[2], &s[29], false);
  HadamardRotation(&s[3], &s[28], false);
  HadamardRotation(&s[4], &s[27], false);
  HadamardRotation(&s[5], &s[26], false);
  HadamardRotation(&s[6], &s[25], false);
  HadamardRotation(&s[7], &s[24], false);
  HadamardRotation(&s[8], &s[23], false);
  HadamardRotation(&s[9], &s[22], false);
  HadamardRotation(&s[10], &s[21], false);
  HadamardRotation(&s[11], &s[20], false);
  HadamardRotation(&s[12], &s[19], false);
  HadamardRotation(&s[13], &s[18], false);
  HadamardRotation(&s[14], &s[17], false);
  HadamardRotation(&s[15], &s[16], false);
}

// Process dct32 rows or columns, depending on the transpose flag.
LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const void* source,
                                        const int32_t step,
                                        const bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[32], x[32];

  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, idx, input);
      Transpose8x8_U16(input, &x[idx]);
    }
  } else {
    LoadSrc<16, 32>(src, step, 0, x);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_8>(s);
  Dct8Stages<ButterflyRotation_8>(s);
  Dct16Stages<ButterflyRotation_8>(s);
  Dct32Stages<ButterflyRotation_8>(s);

  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i output[8];
      Transpose8x8_U16(&s[idx], output);
      StoreDst<16, 8>(dst, step, idx, output);
    }
  } else {
    StoreDst<16, 32>(dst, step, 0, s);
  }
}

// Allow the compiler to call this function instead of force inlining. Tests
// show the performance is slightly faster.
void Dct64_SSE4_1(void* dest, const void* source, int32_t step,
                  bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[64], x[32];

  if (transpose) {
    // The last 32 values of every row are always zero if the |tx_width| is
    // 64.
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, idx, input);
      Transpose8x8_U16(input, &x[idx]);
    }
  } else {
    // The last 32 values of every column are always zero if the |tx_height| is
    // 64.
    LoadSrc<16, 32>(src, step, 0, x);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
  s[0] = x[0];
  s[2] = x[16];
  s[4] = x[8];
  s[6] = x[24];
  s[8] = x[4];
  s[10] = x[20];
  s[12] = x[12];
  s[14] = x[28];

  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
  s[16] = x[2];
  s[18] = x[18];
  s[20] = x[10];
  s[22] = x[26];
  s[24] = x[6];
  s[26] = x[22];
  s[28] = x[14];
  s[30] = x[30];

  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
  s[32] = x[1];
  s[34] = x[17];
  s[36] = x[9];
  s[38] = x[25];
  s[40] = x[5];
  s[42] = x[21];
  s[44] = x[13];
  s[46] = x[29];

  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
  s[48] = x[3];
  s[50] = x[19];
  s[52] = x[11];
  s[54] = x[27];
  s[56] = x[7];
  s[58] = x[23];
  s[60] = x[15];
  s[62] = x[31];

  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);

  //-- start dct 64 stages
  // stage 2.
  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);

  // stage 4.
  HadamardRotation(&s[32], &s[33], false);
  HadamardRotation(&s[34], &s[35], true);
  HadamardRotation(&s[36], &s[37], false);
  HadamardRotation(&s[38], &s[39], true);
  HadamardRotation(&s[40], &s[41], false);
  HadamardRotation(&s[42], &s[43], true);
  HadamardRotation(&s[44], &s[45], false);
  HadamardRotation(&s[46], &s[47], true);
  HadamardRotation(&s[48], &s[49], false);
  HadamardRotation(&s[50], &s[51], true);
  HadamardRotation(&s[52], &s[53], false);
  HadamardRotation(&s[54], &s[55], true);
  HadamardRotation(&s[56], &s[57], false);
  HadamardRotation(&s[58], &s[59], true);
  HadamardRotation(&s[60], &s[61], false);
  HadamardRotation(&s[62], &s[63], true);

  // stage 7.
  ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
  ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
  ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
  ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
  ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
  ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
  ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
  ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);

  // stage 11.
  HadamardRotation(&s[32], &s[35], false);
  HadamardRotation(&s[33], &s[34], false);
  HadamardRotation(&s[36], &s[39], true);
  HadamardRotation(&s[37], &s[38], true);
  HadamardRotation(&s[40], &s[43], false);
  HadamardRotation(&s[41], &s[42], false);
  HadamardRotation(&s[44], &s[47], true);
  HadamardRotation(&s[45], &s[46], true);
  HadamardRotation(&s[48], &s[51], false);
  HadamardRotation(&s[49], &s[50], false);
  HadamardRotation(&s[52], &s[55], true);
  HadamardRotation(&s[53], &s[54], true);
  HadamardRotation(&s[56], &s[59], false);
  HadamardRotation(&s[57], &s[58], false);
  HadamardRotation(&s[60], &s[63], true);
  HadamardRotation(&s[61], &s[62], true);

  // stage 16.
  ButterflyRotation_8(&s[61], &s[34], 56, true);
  ButterflyRotation_8(&s[60], &s[35], 56, true);
  ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
  ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
  ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
  ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
  ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
  ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);

  // stage 21.
  HadamardRotation(&s[32], &s[39], false);
  HadamardRotation(&s[33], &s[38], false);
  HadamardRotation(&s[34], &s[37], false);
  HadamardRotation(&s[35], &s[36], false);
  HadamardRotation(&s[40], &s[47], true);
  HadamardRotation(&s[41], &s[46], true);
  HadamardRotation(&s[42], &s[45], true);
  HadamardRotation(&s[43], &s[44], true);
  HadamardRotation(&s[48], &s[55], false);
  HadamardRotation(&s[49], &s[54], false);
  HadamardRotation(&s[50], &s[53], false);
  HadamardRotation(&s[51], &s[52], false);
  HadamardRotation(&s[56], &s[63], true);
  HadamardRotation(&s[57], &s[62], true);
  HadamardRotation(&s[58], &s[61], true);
  HadamardRotation(&s[59], &s[60], true);

  // stage 25.
  ButterflyRotation_8(&s[59], &s[36], 48, true);
  ButterflyRotation_8(&s[58], &s[37], 48, true);
  ButterflyRotation_8(&s[57], &s[38], 48, true);
  ButterflyRotation_8(&s[56], &s[39], 48, true);
  ButterflyRotation_8(&s[55], &s[40], 112, true);
  ButterflyRotation_8(&s[54], &s[41], 112, true);
  ButterflyRotation_8(&s[53], &s[42], 112, true);
  ButterflyRotation_8(&s[52], &s[43], 112, true);

  // stage 28.
  HadamardRotation(&s[32], &s[47], false);
  HadamardRotation(&s[33], &s[46], false);
  HadamardRotation(&s[34], &s[45], false);
  HadamardRotation(&s[35], &s[44], false);
  HadamardRotation(&s[36], &s[43], false);
  HadamardRotation(&s[37], &s[42], false);
  HadamardRotation(&s[38], &s[41], false);
  HadamardRotation(&s[39], &s[40], false);
  HadamardRotation(&s[48], &s[63], true);
  HadamardRotation(&s[49], &s[62], true);
  HadamardRotation(&s[50], &s[61], true);
  HadamardRotation(&s[51], &s[60], true);
  HadamardRotation(&s[52], &s[59], true);
  HadamardRotation(&s[53], &s[58], true);
  HadamardRotation(&s[54], &s[57], true);
  HadamardRotation(&s[55], &s[56], true);

  // stage 30.
  ButterflyRotation_8(&s[55], &s[40], 32, true);
  ButterflyRotation_8(&s[54], &s[41], 32, true);
  ButterflyRotation_8(&s[53], &s[42], 32, true);
  ButterflyRotation_8(&s[52], &s[43], 32, true);
  ButterflyRotation_8(&s[51], &s[44], 32, true);
  ButterflyRotation_8(&s[50], &s[45], 32, true);
  ButterflyRotation_8(&s[49], &s[46], 32, true);
  ButterflyRotation_8(&s[48], &s[47], 32, true);

  // stage 31.
  for (int i = 0; i < 32; i += 4) {
    HadamardRotation(&s[i], &s[63 - i], false);
    HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
    HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
    HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
  }
  //-- end dct 64 stages

  if (transpose) {
    for (int idx = 0; idx < 64; idx += 8) {
      __m128i output[8];
      Transpose8x8_U16(&s[idx], output);
      StoreDst<16, 8>(dst, step, idx, output);
    }
  } else {
    StoreDst<16, 64>(dst, step, 0, s);
  }
}

//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).

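// The 4-point ADST is not built from butterfly rotations. It is computed
// directly with 16x16 -> 32 bit multiply-adds against the kAdst4Multiplier
// constants, keeping 32-bit intermediates until the final rounded shift by
// 12.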
template <bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[4];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(src, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(src, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(src, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }

  const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
  const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
  const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
  const __m128i kAdst4Multiplier_m0_1 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
                     (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
  const __m128i kAdst4Multiplier_3_0 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
                     (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));

  // stage 1.
  const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
  const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
  const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
  const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
  const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);

  s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
  s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);

  // stage 2.
  // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
  const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
  const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
  const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);

  // stage 3.
  s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
  s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
  s[2] = b7;
  s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);

  // stage 4.
  s[0] = _mm_add_epi32(s[0], s[5]);
  s[1] = _mm_sub_epi32(s[1], s[6]);

  // stages 5 and 6.
  x[0] = _mm_add_epi32(s[0], s[3]);
  x[1] = _mm_add_epi32(s[1], s[3]);
  x[2] = _mm_add_epi32(s[0], s[1]);
  x[3] = _mm_sub_epi32(x[2], s[3]);

  x[0] = RightShiftWithRounding_S32(x[0], 12);
  x[1] = RightShiftWithRounding_S32(x[1], 12);
  x[2] = RightShiftWithRounding_S32(s[2], 12);
  x[3] = RightShiftWithRounding_S32(x[3], 12);

  x[0] = _mm_packs_epi32(x[0], x[1]);
  x[2] = _mm_packs_epi32(x[2], x[3]);
  x[1] = _mm_srli_si128(x[0], 8);
  x[3] = _mm_srli_si128(x[2], 8);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(x, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
    StoreDst<8, 4>(dst, step, 0, x);
  }
}

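// Multiplier pairs for a single _mm_madd_epi16(): madd of a broadcast |s0|
// with these pairs yields s0 * {k0, k1, k2, k1 + k0} (see the comment at the
// madd below), which matches the four DC-only ADST outputs computed in
// Adst4DcOnlyColumn(), where x3 = s0 + s1.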
constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
                                               3344, 0, 2482, 1321};

LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
                                       int non_zero_coeff_count,
                                       bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  const __m128i v_src =
      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0), 0);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const __m128i v_kAdst4DcOnlyMultipliers =
      LoadUnaligned16(kAdst4DcOnlyMultiplier);
  // s0*k0 s0*k1 s0*k2 s0*k1
  // +
  // s0*0 s0*0 s0*0 s0*k0
  const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
  const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i c = _mm_packs_epi32(b, b);
  StoreLo8(dst, c);

  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
                                             int non_zero_coeff_count,
                                             int width) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  int i = 0;
  do {
    const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&src[i]));
    const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
    const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
    const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
    const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
    const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
    const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
    const __m128i x0 = s0;
    const __m128i x1 = s1;
    const __m128i x2 = s2;
    const __m128i x3 = _mm_add_epi32(s0, s1);
    const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
    const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
    const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
    const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
    const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
    const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
    StoreLo8(&dst[i], dst_0_1);
    StoreHi8(&dst[i + width * 1], dst_0_1);
    StoreLo8(&dst[i + width * 2], dst_2_3);
    StoreHi8(&dst[i + width * 3], dst_2_3);
    i += 4;
  } while (i < width);

  return true;
}

template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[8];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(src, step, 0, x);
    }
  }

  // stage 1.
  s[0] = x[7];
  s[1] = x[0];
  s[2] = x[5];
  s[3] = x[2];
  s[4] = x[3];
  s[5] = x[4];
  s[6] = x[1];
  s[7] = x[6];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
  butterfly_rotation(&s[6], &s[7], 60 - 48, true);

  // stage 3.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[3], &s[7], false);

  // stage 4.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);

  // stage 6.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);

  // stage 7.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(x, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, x);
    }
  }
}

LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
                                       int non_zero_coeff_count,
                                       bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8];

  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  // stage 1.
  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);

  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

  // stage 3.
  s[4] = s[0];
  s[5] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[4], &s[5], 48, true);

  // stage 5.
  s[2] = s[0];
  s[3] = s[1];
  s[6] = s[4];
  s[7] = s[5];

  // stage 6.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);

  // stage 7.
  __m128i x[8];
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
  const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
  const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
  const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
  const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
  const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);

  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
  const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
  StoreUnaligned16(dst, _mm_packs_epi32(b, b1));

  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
                                             int non_zero_coeff_count,
                                             int width) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8];

  int i = 0;
  do {
    const __m128i v_src = LoadLo8(&src[i]);
    // stage 1.
    s[1] = v_src;

    // stage 2.
    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

    // stage 3.
    s[4] = s[0];
    s[5] = s[1];

    // stage 4.
    ButterflyRotation_4(&s[4], &s[5], 48, true);

    // stage 5.
    s[2] = s[0];
    s[3] = s[1];
    s[6] = s[4];
    s[7] = s[5];

    // stage 6.
    ButterflyRotation_4(&s[2], &s[3], 32, true);
    ButterflyRotation_4(&s[6], &s[7], 32, true);

    // stage 7.
    __m128i x[8];
    const __m128i v_zero = _mm_setzero_si128();
    x[0] = s[0];
    x[1] = _mm_subs_epi16(v_zero, s[4]);
    x[2] = s[6];
    x[3] = _mm_subs_epi16(v_zero, s[2]);
    x[4] = s[3];
    x[5] = _mm_subs_epi16(v_zero, s[7]);
    x[6] = s[5];
    x[7] = _mm_subs_epi16(v_zero, s[1]);

    for (int j = 0; j < 8; ++j) {
      StoreLo8(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}

template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, const void* source,
                                         int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[16], x[16];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(src, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(src, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(src, step, 0, x);
    }
  }

  // stage 1.
  s[0] = x[15];
  s[1] = x[0];
  s[2] = x[13];
  s[3] = x[2];
  s[4] = x[11];
  s[5] = x[4];
  s[6] = x[9];
  s[7] = x[6];
  s[8] = x[7];
  s[9] = x[8];
  s[10] = x[5];
  s[11] = x[10];
  s[12] = x[3];
  s[13] = x[12];
  s[14] = x[1];
  s[15] = x[14];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
  butterfly_rotation(&s[14], &s[15], 62 - 56, true);

  // stage 3.
  HadamardRotation(&s[0], &s[8], false);
  HadamardRotation(&s[1], &s[9], false);
  HadamardRotation(&s[2], &s[10], false);
  HadamardRotation(&s[3], &s[11], false);
  HadamardRotation(&s[4], &s[12], false);
  HadamardRotation(&s[5], &s[13], false);
  HadamardRotation(&s[6], &s[14], false);
  HadamardRotation(&s[7], &s[15], false);

  // stage 4.
  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
  butterfly_rotation(&s[15], &s[14], 8 + 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[8], &s[12], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[9], &s[13], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[10], &s[14], false);
  HadamardRotation(&s[3], &s[7], false);
  HadamardRotation(&s[11], &s[15], false);

  // stage 6.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
  butterfly_rotation(&s[15], &s[14], 48 - 32, true);

  // stage 7.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[8], &s[10], false);
  HadamardRotation(&s[12], &s[14], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);
  HadamardRotation(&s[9], &s[11], false);
  HadamardRotation(&s[13], &s[15], false);

  // stage 8.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);
  butterfly_rotation(&s[10], &s[11], 32, true);
  butterfly_rotation(&s[14], &s[15], 32, true);

  // stage 9.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&x[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&x[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, x);
    }
  }
}

LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);

  // stage 3.
  s[8] = s[0];
  s[9] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[8], &s[9], 56, true);

  // stage 5.
  s[4] = s[0];
  s[12] = s[8];
  s[5] = s[1];
  s[13] = s[9];

  // stage 6.
  ButterflyRotation_4(&s[4], &s[5], 48, true);
  ButterflyRotation_4(&s[12], &s[13], 48, true);

  // stage 7.
  s[2] = s[0];
  s[6] = s[4];
  s[10] = s[8];
  s[14] = s[12];
  s[3] = s[1];
  s[7] = s[5];
  s[11] = s[9];
  s[15] = s[13];

  // stage 8.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);
  ButterflyRotation_4(&s[10], &s[11], 32, true);
  ButterflyRotation_4(&s[14], &s[15], 32, true);

  // stage 9.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);
}

LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
                                        int non_zero_coeff_count,
                                        bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[16];
  __m128i x[16];

  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  // stage 1.
  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);

  Adst16DcOnlyInternal(s, x);

  for (int i = 0; i < 2; ++i) {
    const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
    const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
    const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
    const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
    const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
    const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);

    const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
    const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
    const __m128i a =
        _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
    const __m128i a1 =
        _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
    const __m128i b = _mm_sra_epi32(a, v_row_shift);
    const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
    StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
  }
  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
                                              int non_zero_coeff_count,
                                              int width) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  int i = 0;
  do {
    __m128i s[16];
    __m128i x[16];
    const __m128i v_src = LoadUnaligned16(&src[i]);
    // stage 1.
    s[1] = v_src;

    Adst16DcOnlyInternal(s, x);

    for (int j = 0; j < 16; ++j) {
      StoreLo8(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}

//------------------------------------------------------------------------------
// Identity Transforms.

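// Identity transforms scale coefficients rather than mixing them. In the row
// path the scale and the row shift are fused into one _mm_madd_epi16(): each
// 32-bit lane computes v_dual_round * 1 + src * kIdentity4Multiplier, where
// v_dual_round = (1 + (shift << 1)) << 11 folds the rounding biases of both
// the multiplier shift (>> 12) and the row shift into a single constant ahead
// of the combined arithmetic shift by (12 + shift).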
template <bool is_row_shift>
LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, const void* source,
                                            int32_t step) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  if (is_row_shift) {
    const int shift = 1;
    const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
    const __m128i v_multiplier_one =
        _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
    for (int i = 0; i < 4; i += 2) {
      const __m128i v_src = LoadUnaligned16(&src[i * step]);
      const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
      const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
      const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
      const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
      const __m128i b = _mm_srai_epi32(a, 12 + shift);
      const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
      StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
    }
  } else {
    const __m128i v_multiplier =
        _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
    for (int i = 0; i < 4; i += 2) {
      const __m128i v_src = LoadUnaligned16(&src[i * step]);
      const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
      const __m128i b = _mm_adds_epi16(a, v_src);
      StoreUnaligned16(&dst[i * step], b);
    }
  }
}

LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           bool should_round, int tx_height) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);

  const int shift = (tx_height < 16) ? 0 : 1;
  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
  const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
  const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
  const __m128i b = _mm_srai_epi32(a, 12 + shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}

Identity4ColumnStoreToFrame(Array2DView<uint8_t> frame,const int start_x,const int start_y,const int tx_width,const int tx_height,const int16_t * source)1687 LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
1688 Array2DView<uint8_t> frame, const int start_x, const int start_y,
1689 const int tx_width, const int tx_height, const int16_t* source) {
1690 const int stride = frame.columns();
1691 uint8_t* dst = frame[start_y] + start_x;
1692
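  // Assuming kIdentity4MultiplierFraction == kIdentity4Multiplier - (1 << 12)
  // (the multiplier's fractional part), adding v_src back after the 12-bit
  // fixed point multiply below reconstructs src * kIdentity4Multiplier / 4096
  // while keeping the _mm_mulhrs_epi16 product within 16 bits.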
  const __m128i v_multiplier_fraction =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
  const __m128i v_eight = _mm_set1_epi16(8);

  if (tx_width == 4) {
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
      const __m128i frame_data = Load4(dst);
      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_src_mult =
            _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}

LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;

  const __m128i v_multiplier_fraction =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
  const __m128i v_eight = _mm_set1_epi16(8);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);

  if (tx_width == 4) {
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
      const __m128i frame_data = Load4(dst);
      const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
      const __m128i v_src_mult2 =
          _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
      const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
      const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
      const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_adds_epi16(frame_data16, b);
      Store4(dst, _mm_packus_epi16(c, c));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_src_round =
            _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
        const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
        const __m128i v_src_mult2 =
            _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
        const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
        const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_adds_epi16(frame_data16, b);
        StoreLo8(dst + j, _mm_packus_epi16(c, c));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}

LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, const void* source,
                                                 int32_t step) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  // When combining the identity8 multiplier with the row shift, the
  // calculations for tx_height equal to 32 can be simplified from
  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
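  // With the multiplier 1 << 14, _mm_mulhrs_epi16 computes
  // (A * (1 << 14) + (1 << 14)) >> 15 == (A + 1) >> 1 exactly.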
  const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
  for (int h = 0; h < 4; ++h) {
    const __m128i v_src = LoadUnaligned16(&src[h * step]);
    const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
    StoreUnaligned16(&dst[h * step], v_src_mult);
  }
}

LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, const void* source,
                                                int32_t step) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  for (int h = 0; h < 4; ++h) {
    const __m128i v_src = LoadUnaligned16(&src[h * step]);
    // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
    // a saturating add here is ok.
    const __m128i a = _mm_adds_epi16(v_src, v_src);
    StoreUnaligned16(&dst[h * step], a);
  }
}

LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src =
      _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
  const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}

LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  const __m128i v_eight = _mm_set1_epi16(8);
  if (tx_width == 4) {
    int i = 0;
    do {
      const int row = i * tx_width;
      const __m128i v_src = LoadLo8(&source[row]);
      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
      const __m128i frame_data = Load4(dst);
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}

LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, const void* source,
                                                int32_t step, int shift) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);

  for (int h = 0; h < 4; ++h) {
    const __m128i v_src = LoadUnaligned16(&src[h * step]);
    const __m128i v_src2 = LoadUnaligned16(&src[h * step + 8]);
    const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
    const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
    const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
    const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
    const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
    const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
    const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
    const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
    const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
    const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
    const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
    const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
    StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
    StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
  }
}

LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
                                            int non_zero_coeff_count,
                                            bool should_round, int shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round0 =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
  const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
  const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
  const __m128i b = _mm_sra_epi32(a, v_shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}

LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  const __m128i v_eight = _mm_set1_epi16(8);
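  // The multiplier fraction is applied at twice the weight here (<< 4 rather
  // than << 3) and added to 2 * src below, which appears to reconstruct the
  // identity16 scaling (twice the identity4 multiplier) without overflowing
  // the 16-bit multiply.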
  const __m128i v_multiplier =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));

  if (tx_width == 4) {
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
      const __m128i frame_data = Load4(dst);
      const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}

LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
                                                  const void* source,
                                                  const int32_t step) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  // When combining the identity32 multiplier with the row shift, the
  // calculation for tx_height equal to 16 can be simplified from
  // (((A * 4) + 1) >> 1) to (A * 2).
  for (int h = 0; h < 4; ++h) {
    for (int i = 0; i < 32; i += 8) {
      const __m128i v_src = LoadUnaligned16(&src[h * step + i]);
      // For bitdepth == 8, the identity row clamps to a signed 16-bit value,
      // so a saturating add here is ok.
      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
      StoreUnaligned16(&dst[h * step + i], v_dst_i);
    }
  }
}

LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
                                            int non_zero_coeff_count) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);

  // When combining the identity32 multiplier with the row shift, the
  // calculation for tx_height equal to 16 can be simplified from
  // (((A * 4) + 1) >> 1) to (A * 2).
  const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
  dst[0] = _mm_extract_epi16(v_dst_0, 0);
  return true;
}

LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  const __m128i v_two = _mm_set1_epi16(2);

  int i = 0;
  do {
    const int row = i * tx_width;
    int j = 0;
    do {
      const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
      const __m128i frame_data = LoadLo8(dst + j);
      const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
      const __m128i b = _mm_srai_epi16(a, 2);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      StoreLo8(dst + j, _mm_packus_epi16(d, d));
      j += 8;
    } while (j < tx_width);
    dst += stride;
  } while (++i < tx_height);
}

//------------------------------------------------------------------------------
// Walsh Hadamard Transform.

// Process 4 wht4 rows and columns.
LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
                                       const int start_x, const int start_y,
                                       const void* source,
                                       const int non_zero_coeff_count) {
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[4], x[4];

  if (non_zero_coeff_count == 1) {
    // Special case: only src[0] is nonzero.
    //   src[0]  0  0  0
    //        0  0  0  0
    //        0  0  0  0
    //        0  0  0  0
    //
    // After the row and column transforms are applied, we have:
    //        f  h  h  h
    //        g  i  i  i
    //        g  i  i  i
    //        g  i  i  i
    // where f, g, h, i are computed as follows.
    int16_t f = (src[0] >> 2) - (src[0] >> 3);
    const int16_t g = f >> 1;
    f = f - (f >> 1);
    const int16_t h = (src[0] >> 3) - (src[0] >> 4);
    const int16_t i = (src[0] >> 4);
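    // These follow from the butterflies applied to the single nonzero input:
    // the row pass leaves [f + g, h + i, h + i, h + i] in row 0 (zeros
    // elsewhere), and the column pass splits each row-0 value v into
    // v - (v >> 1) for row 0 and v >> 1 for rows 1-3.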
    s[0] = _mm_set1_epi16(h);
    s[0] = _mm_insert_epi16(s[0], f, 0);
    s[1] = _mm_set1_epi16(i);
    s[1] = _mm_insert_epi16(s[1], g, 0);
    s[2] = s[3] = s[1];
  } else {
    x[0] = LoadLo8(&src[0 * 4]);
    x[2] = LoadLo8(&src[1 * 4]);
    x[3] = LoadLo8(&src[2 * 4]);
    x[1] = LoadLo8(&src[3 * 4]);

    // Row transforms.
    Transpose4x4_U16(x, x);
    s[0] = _mm_srai_epi16(x[0], 2);
    s[2] = _mm_srai_epi16(x[1], 2);
    s[3] = _mm_srai_epi16(x[2], 2);
    s[1] = _mm_srai_epi16(x[3], 2);
    s[0] = _mm_add_epi16(s[0], s[2]);
    s[3] = _mm_sub_epi16(s[3], s[1]);
    __m128i e = _mm_sub_epi16(s[0], s[3]);
    e = _mm_srai_epi16(e, 1);
    s[1] = _mm_sub_epi16(e, s[1]);
    s[2] = _mm_sub_epi16(e, s[2]);
    s[0] = _mm_sub_epi16(s[0], s[1]);
    s[3] = _mm_add_epi16(s[3], s[2]);
    Transpose4x4_U16(s, s);

    // Column transforms.
    s[0] = _mm_add_epi16(s[0], s[2]);
    s[3] = _mm_sub_epi16(s[3], s[1]);
    e = _mm_sub_epi16(s[0], s[3]);
    e = _mm_srai_epi16(e, 1);
    s[1] = _mm_sub_epi16(e, s[1]);
    s[2] = _mm_sub_epi16(e, s[2]);
    s[0] = _mm_sub_epi16(s[0], s[1]);
    s[3] = _mm_add_epi16(s[3], s[2]);
  }

  // Store to frame.
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  for (int row = 0; row < 4; ++row) {
    const __m128i frame_data = Load4(dst);
    const __m128i a = _mm_cvtepu8_epi16(frame_data);
    // Saturate to prevent overflowing int16_t
    const __m128i b = _mm_adds_epi16(a, s[row]);
    Store4(dst, _mm_packus_epi16(b, b));
    dst += stride;
  }
}

//------------------------------------------------------------------------------
// row/column transform loops

template <bool enable_flip_rows = false>
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source,
    TransformType tx_type) {
  const bool flip_rows =
      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
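  // Adding 8 and shifting right by 4 below implements
  // RightShiftWithRounding(residual, 4), the final scaling applied before the
  // residual is added to the 8bpp frame data.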
  const __m128i v_eight = _mm_set1_epi16(8);
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  if (tx_width == 4) {
    for (int i = 0; i < tx_height; ++i) {
      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
      const __m128i residual = LoadLo8(&source[row]);
      const __m128i frame_data = Load4(dst);
      // Saturate to prevent overflowing int16_t
      const __m128i a = _mm_adds_epi16(residual, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    }
  } else if (tx_width == 8) {
    for (int i = 0; i < tx_height; ++i) {
      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
      const __m128i residual = LoadUnaligned16(&source[row]);
      const __m128i frame_data = LoadLo8(dst);
      // Saturate to prevent overflowing int16_t
      const __m128i b = _mm_adds_epi16(residual, v_eight);
      const __m128i c = _mm_srai_epi16(b, 4);
      const __m128i d = _mm_cvtepu8_epi16(frame_data);
      const __m128i e = _mm_adds_epi16(d, c);
      StoreLo8(dst, _mm_packus_epi16(e, e));
      dst += stride;
    }
  } else {
    for (int i = 0; i < tx_height; ++i) {
      const int y = start_y + i;
      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
      int j = 0;
      do {
        const int x = start_x + j;
        const __m128i residual = LoadUnaligned16(&source[row + j]);
        const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
        const __m128i frame_data = LoadUnaligned16(frame[y] + x);
        const __m128i b = _mm_adds_epi16(residual, v_eight);
        const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
        const __m128i c = _mm_srai_epi16(b, 4);
        const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
        const __m128i d = _mm_cvtepu8_epi16(frame_data);
        const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
        const __m128i e = _mm_adds_epi16(d, c);
        const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
        StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
        j += 16;
      } while (j < tx_width);
    }
  }
}

template <int tx_height>
LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
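  // pshufb mask that reverses the order of the eight 16-bit lanes in a
  // register.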
  const __m128i word_reverse_8 =
      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
  if (tx_width >= 16) {
    int i = 0;
    do {
      // read 16 shorts
      const __m128i v3210 = LoadUnaligned16(&source[i]);
      const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
      const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
      const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
      StoreUnaligned16(&source[i], v4567);
      StoreUnaligned16(&source[i + 8], v0123);
      i += 16;
    } while (i < tx_width * tx_height);
  } else if (tx_width == 8) {
    for (int i = 0; i < 8 * tx_height; i += 8) {
      const __m128i a = LoadUnaligned16(&source[i]);
      const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
      StoreUnaligned16(&source[i], b);
    }
  } else {
    const __m128i dual_word_reverse_4 =
        _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
    // Process two rows per iteration.
    for (int i = 0; i < 4 * tx_height; i += 8) {
      const __m128i a = LoadUnaligned16(&source[i]);
      const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
      StoreUnaligned16(&source[i], b);
    }
  }
}

template <int tx_width>
LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  if (tx_width == 4) {
    // Process two rows per iteration.
    int i = 0;
    do {
      const __m128i a = LoadUnaligned16(&source[i]);
      const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
      StoreUnaligned16(&source[i], b);
      i += 8;
    } while (i < tx_width * num_rows);
  } else {
    int i = 0;
    do {
      // The last 32 values of every row are always zero if the |tx_width| is
      // 64.
      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
      int j = 0;
      do {
        const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
        const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
        StoreUnaligned16(&source[i * tx_width + j], b);
        j += 8;
      } while (j < non_zero_width);
    } while (++i < num_rows);
  }
}

template <int tx_width>
LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
                                    int row_shift) {
  const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
  const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
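  // row_shift is 1 or 2 here, and for those values row_shift equals
  // 1 << (row_shift - 1), which presumably lets the same constant double as
  // the rounding addend passed to ShiftResidual.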
  if (tx_width == 4) {
    // Process two rows per iteration.
    int i = 0;
    do {
      const __m128i residual = LoadUnaligned16(&source[i]);
      const __m128i shifted_residual =
          ShiftResidual(residual, v_row_shift_add, v_row_shift);
      StoreUnaligned16(&source[i], shifted_residual);
      i += 8;
    } while (i < tx_width * num_rows);
  } else {
    int i = 0;
    do {
      for (int j = 0; j < tx_width; j += 8) {
        const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
        const __m128i shifted_residual =
            ShiftResidual(residual, v_row_shift_add, v_row_shift);
        StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
      }
    } while (++i < num_rows);
  }
}

void Dct4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                              void* src_buffer, int start_x, int start_y,
                              void* dst_frame, bool is_row,
                              int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = (tx_height == 8);
    const int row_shift = static_cast<int>(tx_height == 16);

    if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
                     row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<4>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d dct4 rows in parallel.
      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
                                              /*transpose=*/true);
    } else {
      // Process 8 1d dct4 rows in parallel per iteration.
      int i = 0;
      do {
        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
                                               /*step=*/4, /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    if (tx_height == 16) {
      RowShift<4>(src, num_rows, 1);
    }
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<4>(src, tx_width);
  }

  if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d dct4 columns in parallel.
      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
                                              /*transpose=*/false);
    } else {
      // Process 8 1d dct4 columns in parallel per iteration.
      int i = 0;
      do {
        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
                                               /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
}

void Dct8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                              void* src_buffer, int start_x, int start_y,
                              void* dst_frame, bool is_row,
                              int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
                     row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<8>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d dct8 rows in parallel.
      Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
                                             /*transpose=*/true);
    } else {
      // Process 8 1d dct8 rows in parallel per iteration.
      int i = 0;
      do {
        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
                                                /*step=*/8, /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    if (row_shift > 0) {
      RowShift<8>(src, num_rows, row_shift);
    }
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<8>(src, tx_width);
  }

  if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d dct8 columns in parallel.
      Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
                                             /*transpose=*/false);
    } else {
      // Process 8 1d dct8 columns in parallel per iteration.
      int i = 0;
      do {
        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
                                                /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
}

void Dct16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                               void* src_buffer, int start_x, int start_y,
                               void* dst_frame, bool is_row,
                               int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
                      row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<16>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d dct16 rows in parallel.
      Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
                                              /*transpose=*/true);
    } else {
      int i = 0;
      do {
        // Process 8 1d dct16 rows in parallel per iteration.
        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
                                                 /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    // row_shift is always non-zero here.
    RowShift<16>(src, num_rows, row_shift);

    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<16>(src, tx_width);
  }

  if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d dct16 columns in parallel.
      Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
                                              /*transpose=*/false);
    } else {
      int i = 0;
      do {
        // Process 8 1d dct16 columns in parallel per iteration.
        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
                                                 /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
}

void Dct32TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                               void* src_buffer, int start_x, int start_y,
                               void* dst_frame, bool is_row,
                               int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
                      row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<32>(src, num_rows);
    }
    // Process 8 1d dct32 rows in parallel per iteration.
    int i = 0;
    do {
      Dct32_SSE4_1(&src[i * 32], &src[i * 32], 32, /*transpose=*/true);
      i += 8;
    } while (i < num_rows);
    // row_shift is always non-zero here.
    RowShift<32>(src, num_rows, row_shift);

    return;
  }

  assert(!is_row);
  if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    // Process 8 1d dct32 columns in parallel per iteration.
    int i = 0;
    do {
      Dct32_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
      i += 8;
    } while (i < tx_width);
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
}

void Dct64TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                               void* src_buffer, int start_x, int start_y,
                               void* dst_frame, bool is_row,
                               int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
                      row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<64>(src, num_rows);
    }
    // Process 8 1d dct64 rows in parallel per iteration.
    int i = 0;
    do {
      Dct64_SSE4_1(&src[i * 64], &src[i * 64], 64, /*transpose=*/true);
      i += 8;
    } while (i < num_rows);
    // row_shift is always non-zero here.
    RowShift<64>(src, num_rows, row_shift);

    return;
  }

  assert(!is_row);
  if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    // Process 8 1d dct64 columns in parallel per iteration.
    int i = 0;
    do {
      Dct64_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
      i += 8;
    } while (i < tx_width);
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
}

void Adst4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                               void* src_buffer, int start_x, int start_y,
                               void* dst_frame, bool is_row,
                               int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
    const bool should_round = (tx_height == 8);

    if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                    row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<4>(src, num_rows);
    }

    // Process 4 1d adst4 rows in parallel per iteration.
    int i = 0;
    do {
      Adst4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4,
                          /*transpose=*/true);
      i += 4;
    } while (i < num_rows);

    if (row_shift != 0u) {
      RowShift<4>(src, num_rows, 1);
    }
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<4>(src, tx_width);
  }

  if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    // Process 4 1d adst4 columns in parallel per iteration.
    int i = 0;
    do {
      Adst4_SSE4_1<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
      i += 4;
    } while (i < tx_width);
  }
  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                   tx_width, 4, src, tx_type);
}

void Adst8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                               void* src_buffer, int start_x, int start_y,
                               void* dst_frame, bool is_row,
                               int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                    row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<8>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d adst8 rows in parallel.
      Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
                                              /*transpose=*/true);
    } else {
      // Process 8 1d adst8 rows in parallel per iteration.
      int i = 0;
      do {
        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
                                                 /*step=*/8,
                                                 /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    if (row_shift > 0) {
      RowShift<8>(src, num_rows, row_shift);
    }
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<8>(src, tx_width);
  }

  if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d adst8 columns in parallel.
      Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
                                              /*transpose=*/false);
    } else {
      // Process 8 1d adst8 columns in parallel per iteration.
      int i = 0;
      do {
        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
                                                 /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                   tx_width, 8, src, tx_type);
}

void Adst16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                                void* src_buffer, int start_x, int start_y,
                                void* dst_frame, bool is_row,
                                int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];

    if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                     row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<16>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d adst16 rows in parallel.
      Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
                                               /*transpose=*/true);
    } else {
      int i = 0;
      do {
        // Process 8 1d adst16 rows in parallel per iteration.
        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16],
                                                  16, /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    // row_shift is always non-zero here.
    RowShift<16>(src, num_rows, row_shift);

    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<16>(src, tx_width);
  }

  if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d adst16 columns in parallel.
      Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
                                               /*transpose=*/false);
    } else {
      int i = 0;
      do {
        // Process 8 1d adst16 columns in parallel per iteration.
        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
                                                  /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                   tx_width, 16, src, tx_type);
}

void Identity4TransformLoop_SSE4_1(TransformType tx_type,
                                   TransformSize tx_size, void* src_buffer,
                                   int start_x, int start_y, void* dst_frame,
                                   bool is_row, int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    // Special case: Process row calculations during column transform call.
    // Improves performance.
    if (tx_type == kTransformTypeIdentityIdentity &&
        tx_size == kTransformSize4x4) {
      return;
    }

    const bool should_round = (tx_height == 8);
    if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                        tx_height)) {
      return;
    }

    const int num_rows =
        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<4>(src, num_rows);
    }
    if (tx_height < 16) {
      int i = 0;
      do {
        Identity4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4);
        i += 4;
      } while (i < num_rows);
    } else {
      int i = 0;
      do {
        Identity4_SSE4_1<true>(&src[i * 4], &src[i * 4], /*step=*/4);
        i += 4;
      } while (i < num_rows);
    }
    return;
  }
  assert(!is_row);
  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
  // Special case: Process row calculations during column transform call.
  if (tx_type == kTransformTypeIdentityIdentity &&
      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
                                   src);
    return;
  }

  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<4>(src, tx_width);
  }

  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
}

void Identity8TransformLoop_SSE4_1(TransformType tx_type,
                                   TransformSize tx_size, void* src_buffer,
                                   int start_x, int start_y, void* dst_frame,
                                   bool is_row, int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    // Special case: Process row calculations during column transform call.
    // Improves performance.
    if (tx_type == kTransformTypeIdentityIdentity &&
        tx_size == kTransformSize8x4) {
      return;
    }

    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];
    if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                        row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<8>(src, num_rows);
    }

    // When combining the identity8 multiplier with the row shift, the
    // calculations for tx_height == 8 and tx_height == 16 can be simplified
    // from (((A * 2) + 1) >> 1) to A.
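    // 0x18 == 8 | 16, so this early return covers exactly those two heights.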
    if ((tx_height & 0x18) != 0) {
      return;
    }
    if (tx_height == 32) {
      int i = 0;
      do {
        Identity8Row32_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
        i += 4;
      } while (i < num_rows);
      return;
    }

    // Process kTransformSize8x4
    assert(tx_size == kTransformSize8x4);
    int i = 0;
    do {
      Identity8Row4_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
      i += 4;
    } while (i < num_rows);
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<8>(src, tx_width);
  }

  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
                                     src);
}

void Identity16TransformLoop_SSE4_1(TransformType tx_type,
                                    TransformSize tx_size, void* src_buffer,
                                    int start_x, int start_y, void* dst_frame,
                                    bool is_row, int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    const bool should_round = kShouldRound[tx_size];
    const uint8_t row_shift = kTransformRowShift[tx_size];
    if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
                         row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<16>(src, num_rows);
    }
    int i = 0;
    do {
      Identity16Row_SSE4_1(&src[i * 16], &src[i * 16], /*step=*/16,
                           kTransformRowShift[tx_size]);
      i += 4;
    } while (i < num_rows);
    return;
  }

  assert(!is_row);
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<16>(src, tx_width);
  }
  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
                                      height, src);
}

void Identity32TransformLoop_SSE4_1(TransformType tx_type,
                                    TransformSize tx_size, void* src_buffer,
                                    int start_x, int start_y, void* dst_frame,
                                    bool is_row, int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    // When combining the identity32 multiplier with the row shift, the
    // calculations for tx_height == 8 and tx_height == 32 can be simplified
    // from (((A * 4) + 2) >> 2) to A.
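    // 0x28 == 8 | 32, so this early return covers exactly those two heights.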
    if ((tx_height & 0x28) != 0) {
      return;
    }

    // Process kTransformSize32x16. The src is always rounded before the
    // identity transform and shifted by 1 afterwards.

    if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
      return;
    }

    const int num_rows =
        GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);

    // Process kTransformSize32x16
    assert(tx_size == kTransformSize32x16);
    ApplyRounding<32>(src, num_rows);
    int i = 0;
    do {
      Identity32Row16_SSE4_1(&src[i * 32], &src[i * 32], /*step=*/32);
      i += 4;
    } while (i < num_rows);
    return;
  }

  assert(!is_row);
  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
}

void Wht4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                              void* src_buffer, int start_x, int start_y,
                              void* dst_frame, bool is_row,
                              int non_zero_coeff_count) {
  assert(tx_type == kTransformTypeDctDct);
  assert(tx_size == kTransformSize4x4);
  static_cast<void>(tx_type);
  static_cast<void>(tx_size);
  if (is_row) {
    // Do both row and column transforms in the column-transform pass.
    return;
  }

  assert(!is_row);
  // Process 4 1d wht4 rows and columns in parallel.
  const auto* src = static_cast<int16_t*>(src_buffer);
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  Wht4_SSE4_1(frame, start_x, start_y, src, non_zero_coeff_count);
}

//------------------------------------------------------------------------------

template <typename Residual, typename Pixel>
void InitAll(Dsp* const dsp) {
  // Maximum transform size for Dct is 64.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
      Dct4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
      Dct8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
      Dct16TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
      Dct32TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
      Dct64TransformLoop_SSE4_1;

  // Maximum transform size for Adst is 16.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
      Adst4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
      Adst8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
      Adst16TransformLoop_SSE4_1;

  // Maximum transform size for Identity transform is 32.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
      Identity4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
      Identity8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
      Identity16TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
      Identity32TransformLoop_SSE4_1;

  // Maximum transform size for Wht is 4.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
      Wht4TransformLoop_SSE4_1;
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
  InitAll<int16_t, uint8_t>(dsp);
#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
      Dct4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
      Dct8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
      Dct16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
      Dct32TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
      Dct64TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
      Adst4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
      Adst8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
      Adst16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
      Identity4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
      Identity8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
      Identity16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
      Identity32TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
      Wht4TransformLoop_SSE4_1;
#endif
#endif
}

}  // namespace
}  // namespace low_bitdepth

void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1
#else  // !LIBGAV1_ENABLE_SSE4_1
namespace libgav1 {
namespace dsp {

void InverseTransformInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_SSE4_1