// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/inverse_transform.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Include the constants and utility functions inside the anonymous namespace.
#include "src/dsp/inverse_transform.inc"

template <int store_width, int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
                                    const __m128i* s) {
  // NOTE: It is expected that the compiler will unroll these loops.
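  // store_width and store_count are template parameters, so both branches and
  // the loop bounds are compile-time constants: the untaken branch is
  // eliminated and the loops can be fully unrolled.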
  if (store_width == 16) {
    for (int i = 0; i < store_count; i += 4) {
      StoreUnaligned16(&dst[i * stride + idx], s[i]);
      StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
      StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
      StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
    }
  }
  if (store_width == 8) {
    for (int i = 0; i < store_count; i += 4) {
      StoreLo8(&dst[i * stride + idx], s[i]);
      StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
      StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
      StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
    }
  }
}

template <int load_width, int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
                                   int32_t idx, __m128i* x) {
  // NOTE: It is expected that the compiler will unroll these loops.
  if (load_width == 16) {
    for (int i = 0; i < load_count; i += 4) {
      x[i] = LoadUnaligned16(&src[i * stride + idx]);
      x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
      x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
      x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
    }
  }
  if (load_width == 8) {
    for (int i = 0; i < load_count; i += 4) {
      x[i] = LoadLo8(&src[i * stride + idx]);
      x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
      x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
      x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
    }
  }
}

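// The butterfly rotations below compute, per lane (a sketch of the Q12
// fixed-point arithmetic; Cos128/Sin128 come from inverse_transform.inc and
// return cos/sin of angle * pi / 128 scaled by 4096):
//   x = RightShiftWithRounding(a * cos128 - b * sin128, 12)
//   y = RightShiftWithRounding(a * sin128 + b * cos128, 12)
// with |flip| swapping which result is written to |a| and |b|.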
// Butterfly rotate 4 values.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
                                               const int angle,
                                               const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i psin_pcos = _mm_set1_epi32(
      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
  // -sin cos, -sin cos, -sin cos, -sin cos
  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
  const __m128i x = _mm_packs_epi32(x1, x1);
  const __m128i y = _mm_packs_epi32(y1, y1);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotate 8 values.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
                                               const int angle,
                                               const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i psin_pcos = _mm_set1_epi32(
      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
  // -sin cos, -sin cos, -sin cos, -sin cos
  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
  const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
  const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
  const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
  const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
  const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
  const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
  const __m128i x = _mm_packs_epi32(x1, x1_hi);
  const __m128i y = _mm_packs_epi32(y1, y1_hi);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

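// When one rotation input is known to be zero, the butterfly degenerates to
// two independent multiplies. Shifting the Q12 constant left by 3 turns
// _mm_mulhrs_epi16's rounded >> 15 into an effective rounded >> 12, i.e.
// _mm_mulhrs_epi16(v, _mm_set1_epi16(cos128 << 3)) computes
// RightShiftWithRounding(v * cos128, 12) without widening to 32 bits.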
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
                                                         const int angle,
                                                         const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
  const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
  const __m128i x = _mm_mulhrs_epi16(*b, psin);
  const __m128i y = _mm_mulhrs_epi16(*b, pcos);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
                                                          __m128i* b,
                                                          const int angle,
                                                          const bool flip) {
  const int16_t cos128 = Cos128(angle);
  const int16_t sin128 = Sin128(angle);
  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
  const __m128i psin = _mm_set1_epi16(sin128 << 3);
  const __m128i x = _mm_mulhrs_epi16(*a, pcos);
  const __m128i y = _mm_mulhrs_epi16(*a, psin);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

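// Butterfly add/sub with saturating 16-bit arithmetic. Without |flip| this
// computes a' = a + b, b' = a - b; with |flip| it computes a' = b - a,
// b' = b + a.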
LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
  __m128i x, y;
  if (flip) {
    y = _mm_adds_epi16(*b, *a);
    x = _mm_subs_epi16(*b, *a);
  } else {
    x = _mm_adds_epi16(*a, *b);
    y = _mm_subs_epi16(*a, *b);
  }
  *a = x;
  *b = y;
}

using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
                                       bool flip);

LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
                                            const __m128i v_row_shift_add,
                                            const __m128i v_row_shift) {
  const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
  // The max row_shift is 2, so int16_t values greater than 0x7ffd may
  // overflow. Generate a mask for this case.
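  // For such lanes the rounding add below wraps negative, but the true sum
  // still fits in 16 bits when viewed as unsigned, so a logical shift of the
  // wrapped bits recovers the correct result. v_row_shift_add is assumed to
  // hold the rounding term ((1 << row_shift) >> 1) for the shift.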
  const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
  const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
  // Assume int16_t values.
  const __m128i a = _mm_sra_epi16(x, v_row_shift);
  // Assume uint16_t values.
  const __m128i b = _mm_srl_epi16(x, v_row_shift);
  // Select the correct shifted value.
  return _mm_blendv_epi8(a, b, mask);
}

//------------------------------------------------------------------------------
// Discrete Cosine Transforms (DCT).

template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                     bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
  const __m128i v_src =
      (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
  const __m128i v_mask =
      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
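  // When should_round is set the input is pre-scaled by 1 / sqrt(2):
  // kTransformRowMultiplier is assumed to be the Q12 value
  // round(4096 / sqrt(2)), applied with the same mulhrs-by-(constant << 3)
  // idiom as the rotations above. v_mask is all ones when rounding, so the
  // blend selects the rounded value; otherwise it keeps the raw source.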
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const int16_t cos128 = Cos128(32);
  const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));

  // Expand to 32 bits to prevent int16_t overflows during the shift add.
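  // row_shift is at most 2, and for shifts in [0, 2] the rounding term
  // (1 << row_shift) >> 1 equals row_shift itself (0, 1, 2), so row_shift
  // doubles as the rounding addend below.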
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_cvtepi16_epi32(xy);
  const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
  const __m128i b = _mm_add_epi32(a, v_row_shift_add);
  const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
  const __m128i c = _mm_sra_epi32(b, v_row_shift);
  const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
  const __m128i xy_shifted = _mm_packs_epi32(c, c1);

  if (width == 4) {
    StoreLo8(dst, xy_shifted);
  } else {
    for (int i = 0; i < width; i += 8) {
      StoreUnaligned16(dst, xy_shifted);
      dst += 8;
    }
  }
  return true;
}

template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                           int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  const int16_t cos128 = Cos128(32);

  // Calculate dc values for first row.
  if (width == 4) {
    const __m128i v_src = LoadLo8(dst);
    const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
    StoreLo8(dst, xy);
  } else {
    int i = 0;
    do {
      const __m128i v_src = LoadUnaligned16(&dst[i]);
      const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
      StoreUnaligned16(&dst[i], xy);
      i += 8;
    } while (i < width);
  }

  // Copy first row to the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}

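// Dct4Stages/Dct8Stages/Dct16Stages/Dct32Stages share one interleaved stage
// numbering (the "stage N" comments below), so composing them, as
// Dct8_SSE4_1 through Dct64_SSE4_1 do, executes the stages of the larger
// transform in increasing order.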
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  HadamardRotation(&s[0], &s[3], false);
  HadamardRotation(&s[1], &s[2], false);
}

// Process 4 dct4 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
                                       bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[4], x[4];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(dst, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(dst, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(dst, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }
  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(s, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(s, s);
    }
    StoreDst<8, 4>(dst, step, 0, s);
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false);
  HadamardRotation(&s[6], &s[7], true);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  HadamardRotation(&s[0], &s[7], false);
  HadamardRotation(&s[1], &s[6], false);
  HadamardRotation(&s[2], &s[5], false);
  HadamardRotation(&s[3], &s[4], false);
}

// Process dct8 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
                                       bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[8], x[8];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(dst, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(dst, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(dst, step, 0, x);
    }
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<butterfly_rotation>(s);
  Dct8Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(s, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, s);
    }
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false);
  HadamardRotation(&s[10], &s[11], true);
  HadamardRotation(&s[12], &s[13], false);
  HadamardRotation(&s[14], &s[15], true);

  // stage 14.
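  // Angles of 64 or more (e.g. 112 = 48 + 64 below) select rotations in other
  // quadrants; Cos128/Sin128 are assumed to fold such angles back via the
  // usual trigonometric symmetries, negating or swapping the constants.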
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false);
  HadamardRotation(&s[9], &s[10], false);
  HadamardRotation(&s[12], &s[15], true);
  HadamardRotation(&s[13], &s[14], true);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  HadamardRotation(&s[0], &s[15], false);
  HadamardRotation(&s[1], &s[14], false);
  HadamardRotation(&s[2], &s[13], false);
  HadamardRotation(&s[3], &s[12], false);
  HadamardRotation(&s[4], &s[11], false);
  HadamardRotation(&s[5], &s[10], false);
  HadamardRotation(&s[6], &s[9], false);
  HadamardRotation(&s[7], &s[8], false);
}

// Process dct16 rows or columns, depending on the transpose flag.
template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
                                        bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[16], x[16];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(dst, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(dst, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(dst, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(dst, step, 0, x);
    }
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<butterfly_rotation>(s);
  Dct8Stages<butterfly_rotation>(s);
  Dct16Stages<butterfly_rotation>(s);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&s[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&s[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, s);
    }
  }
}

template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }
  // stage 6.
  HadamardRotation(&s[16], &s[17], false);
  HadamardRotation(&s[18], &s[19], true);
  HadamardRotation(&s[20], &s[21], false);
  HadamardRotation(&s[22], &s[23], true);
  HadamardRotation(&s[24], &s[25], false);
  HadamardRotation(&s[26], &s[27], true);
  HadamardRotation(&s[28], &s[29], false);
  HadamardRotation(&s[30], &s[31], true);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false);
  HadamardRotation(&s[17], &s[18], false);
  HadamardRotation(&s[20], &s[23], true);
  HadamardRotation(&s[21], &s[22], true);
  HadamardRotation(&s[24], &s[27], false);
  HadamardRotation(&s[25], &s[26], false);
  HadamardRotation(&s[28], &s[31], true);
  HadamardRotation(&s[29], &s[30], true);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false);
  HadamardRotation(&s[17], &s[22], false);
  HadamardRotation(&s[18], &s[21], false);
  HadamardRotation(&s[19], &s[20], false);
  HadamardRotation(&s[24], &s[31], true);
  HadamardRotation(&s[25], &s[30], true);
  HadamardRotation(&s[26], &s[29], true);
  HadamardRotation(&s[27], &s[28], true);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  HadamardRotation(&s[0], &s[31], false);
  HadamardRotation(&s[1], &s[30], false);
  HadamardRotation(&s[2], &s[29], false);
  HadamardRotation(&s[3], &s[28], false);
  HadamardRotation(&s[4], &s[27], false);
  HadamardRotation(&s[5], &s[26], false);
  HadamardRotation(&s[6], &s[25], false);
  HadamardRotation(&s[7], &s[24], false);
  HadamardRotation(&s[8], &s[23], false);
  HadamardRotation(&s[9], &s[22], false);
  HadamardRotation(&s[10], &s[21], false);
  HadamardRotation(&s[11], &s[20], false);
  HadamardRotation(&s[12], &s[19], false);
  HadamardRotation(&s[13], &s[18], false);
  HadamardRotation(&s[14], &s[17], false);
  HadamardRotation(&s[15], &s[16], false);
}

// Process dct32 rows or columns, depending on the transpose flag.
LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
                                        const bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[32], x[32];

  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i input[8];
      LoadSrc<16, 8>(dst, step, idx, input);
      Transpose8x8_U16(input, &x[idx]);
    }
  } else {
    LoadSrc<16, 32>(dst, step, 0, x);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_8>(s);
  Dct8Stages<ButterflyRotation_8>(s);
  Dct16Stages<ButterflyRotation_8>(s);
  Dct32Stages<ButterflyRotation_8>(s);

  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i output[8];
      Transpose8x8_U16(&s[idx], output);
      StoreDst<16, 8>(dst, step, idx, output);
    }
  } else {
    StoreDst<16, 32>(dst, step, 0, s);
  }
}

// Allow the compiler to decide whether to inline this function instead of
// forcing it; tests show this is slightly faster.
void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[64], x[32];

  if (transpose) {
    // The last 32 values of every row are always zero if the |tx_width| is
    // 64.
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i input[8];
      LoadSrc<16, 8>(dst, step, idx, input);
      Transpose8x8_U16(input, &x[idx]);
    }
  } else {
    // The last 32 values of every column are always zero if the |tx_height| is
    // 64.
    LoadSrc<16, 32>(dst, step, 0, x);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
  s[0] = x[0];
  s[2] = x[16];
  s[4] = x[8];
  s[6] = x[24];
  s[8] = x[4];
  s[10] = x[20];
  s[12] = x[12];
  s[14] = x[28];

  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
  s[16] = x[2];
  s[18] = x[18];
  s[20] = x[10];
  s[22] = x[26];
  s[24] = x[6];
  s[26] = x[22];
  s[28] = x[14];
  s[30] = x[30];

  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
  s[32] = x[1];
  s[34] = x[17];
  s[36] = x[9];
  s[38] = x[25];
  s[40] = x[5];
  s[42] = x[21];
  s[44] = x[13];
  s[46] = x[29];

  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
  s[48] = x[3];
  s[50] = x[19];
  s[52] = x[11];
  s[54] = x[27];
  s[56] = x[7];
  s[58] = x[23];
  s[60] = x[15];
  s[62] = x[31];
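  // The odd-indexed s[] entries would be fed by x[32..63], which are known to
  // be zero, so they are never initialized here: the is_fast_butterfly stages
  // and the stage 2 *IsZero rotations below only write to them.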

  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);

  //-- start dct 64 stages
  // stage 2.
  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);

  // stage 4.
  HadamardRotation(&s[32], &s[33], false);
  HadamardRotation(&s[34], &s[35], true);
  HadamardRotation(&s[36], &s[37], false);
  HadamardRotation(&s[38], &s[39], true);
  HadamardRotation(&s[40], &s[41], false);
  HadamardRotation(&s[42], &s[43], true);
  HadamardRotation(&s[44], &s[45], false);
  HadamardRotation(&s[46], &s[47], true);
  HadamardRotation(&s[48], &s[49], false);
  HadamardRotation(&s[50], &s[51], true);
  HadamardRotation(&s[52], &s[53], false);
  HadamardRotation(&s[54], &s[55], true);
  HadamardRotation(&s[56], &s[57], false);
  HadamardRotation(&s[58], &s[59], true);
  HadamardRotation(&s[60], &s[61], false);
  HadamardRotation(&s[62], &s[63], true);

  // stage 7.
  ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
  ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
  ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
  ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
  ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
  ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
  ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
  ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);

  // stage 11.
  HadamardRotation(&s[32], &s[35], false);
  HadamardRotation(&s[33], &s[34], false);
  HadamardRotation(&s[36], &s[39], true);
  HadamardRotation(&s[37], &s[38], true);
  HadamardRotation(&s[40], &s[43], false);
  HadamardRotation(&s[41], &s[42], false);
  HadamardRotation(&s[44], &s[47], true);
  HadamardRotation(&s[45], &s[46], true);
  HadamardRotation(&s[48], &s[51], false);
  HadamardRotation(&s[49], &s[50], false);
  HadamardRotation(&s[52], &s[55], true);
  HadamardRotation(&s[53], &s[54], true);
  HadamardRotation(&s[56], &s[59], false);
  HadamardRotation(&s[57], &s[58], false);
  HadamardRotation(&s[60], &s[63], true);
  HadamardRotation(&s[61], &s[62], true);

  // stage 16.
  ButterflyRotation_8(&s[61], &s[34], 56, true);
  ButterflyRotation_8(&s[60], &s[35], 56, true);
  ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
  ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
  ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
  ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
  ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
  ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);

  // stage 21.
  HadamardRotation(&s[32], &s[39], false);
  HadamardRotation(&s[33], &s[38], false);
  HadamardRotation(&s[34], &s[37], false);
  HadamardRotation(&s[35], &s[36], false);
  HadamardRotation(&s[40], &s[47], true);
  HadamardRotation(&s[41], &s[46], true);
  HadamardRotation(&s[42], &s[45], true);
  HadamardRotation(&s[43], &s[44], true);
  HadamardRotation(&s[48], &s[55], false);
  HadamardRotation(&s[49], &s[54], false);
  HadamardRotation(&s[50], &s[53], false);
  HadamardRotation(&s[51], &s[52], false);
  HadamardRotation(&s[56], &s[63], true);
  HadamardRotation(&s[57], &s[62], true);
  HadamardRotation(&s[58], &s[61], true);
  HadamardRotation(&s[59], &s[60], true);

  // stage 25.
  ButterflyRotation_8(&s[59], &s[36], 48, true);
  ButterflyRotation_8(&s[58], &s[37], 48, true);
  ButterflyRotation_8(&s[57], &s[38], 48, true);
  ButterflyRotation_8(&s[56], &s[39], 48, true);
  ButterflyRotation_8(&s[55], &s[40], 112, true);
  ButterflyRotation_8(&s[54], &s[41], 112, true);
  ButterflyRotation_8(&s[53], &s[42], 112, true);
  ButterflyRotation_8(&s[52], &s[43], 112, true);

  // stage 28.
  HadamardRotation(&s[32], &s[47], false);
  HadamardRotation(&s[33], &s[46], false);
  HadamardRotation(&s[34], &s[45], false);
  HadamardRotation(&s[35], &s[44], false);
  HadamardRotation(&s[36], &s[43], false);
  HadamardRotation(&s[37], &s[42], false);
  HadamardRotation(&s[38], &s[41], false);
  HadamardRotation(&s[39], &s[40], false);
  HadamardRotation(&s[48], &s[63], true);
  HadamardRotation(&s[49], &s[62], true);
  HadamardRotation(&s[50], &s[61], true);
  HadamardRotation(&s[51], &s[60], true);
  HadamardRotation(&s[52], &s[59], true);
  HadamardRotation(&s[53], &s[58], true);
  HadamardRotation(&s[54], &s[57], true);
  HadamardRotation(&s[55], &s[56], true);

  // stage 30.
  ButterflyRotation_8(&s[55], &s[40], 32, true);
  ButterflyRotation_8(&s[54], &s[41], 32, true);
  ButterflyRotation_8(&s[53], &s[42], 32, true);
  ButterflyRotation_8(&s[52], &s[43], 32, true);
  ButterflyRotation_8(&s[51], &s[44], 32, true);
  ButterflyRotation_8(&s[50], &s[45], 32, true);
  ButterflyRotation_8(&s[49], &s[46], 32, true);
  ButterflyRotation_8(&s[48], &s[47], 32, true);

  // stage 31.
  for (int i = 0; i < 32; i += 4) {
    HadamardRotation(&s[i], &s[63 - i], false);
    HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
    HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
    HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
  }
  //-- end dct 64 stages

  if (transpose) {
    for (int idx = 0; idx < 64; idx += 8) {
      __m128i output[8];
      Transpose8x8_U16(&s[idx], output);
      StoreDst<16, 8>(dst, step, idx, output);
    }
  } else {
    StoreDst<16, 64>(dst, step, 0, s);
  }
}

//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).

template <bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
                                        bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[8], x[4];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(dst, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(dst, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(dst, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }

  const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
  const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
  const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
  const __m128i kAdst4Multiplier_m0_1 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
                     (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
  const __m128i kAdst4Multiplier_3_0 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
                     (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));

  // stage 1.
  const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
  const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
  const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
  const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
  const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
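  // Zero-extension leaves each 16-bit coefficient in the low half of a 32-bit
  // lane with a zero high half, so _mm_madd_epi16 against a broadcast 16-bit
  // constant produces the plain 32-bit product.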

  s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
  s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);

  // stage 2.
  // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
  const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
  const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
  const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);

  // stage 3.
  s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
  s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
  s[2] = b7;
  s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);

  // stage 4.
  s[0] = _mm_add_epi32(s[0], s[5]);
  s[1] = _mm_sub_epi32(s[1], s[6]);

  // stages 5 and 6.
  x[0] = _mm_add_epi32(s[0], s[3]);
  x[1] = _mm_add_epi32(s[1], s[3]);
  x[2] = _mm_add_epi32(s[0], s[1]);
  x[3] = _mm_sub_epi32(x[2], s[3]);

  x[0] = RightShiftWithRounding_S32(x[0], 12);
  x[1] = RightShiftWithRounding_S32(x[1], 12);
  x[2] = RightShiftWithRounding_S32(s[2], 12);
  x[3] = RightShiftWithRounding_S32(x[3], 12);

  x[0] = _mm_packs_epi32(x[0], x[1]);
  x[2] = _mm_packs_epi32(x[2], x[3]);
  x[1] = _mm_srli_si128(x[0], 8);
  x[3] = _mm_srli_si128(x[2], 8);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(x, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
    StoreDst<8, 4>(dst, step, 0, x);
  }
}

constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
                                               3344, 0, 2482, 1321};

LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  const __m128i v_src =
      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
  const __m128i v_mask =
      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const __m128i v_kAdst4DcOnlyMultipliers =
      LoadUnaligned16(kAdst4DcOnlyMultiplier);
  // s0*k0 s0*k1 s0*k2 s0*k1
  // +
  // s0*0  s0*0  s0*0  s0*k0
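  // = s0*k0 s0*k1 s0*k2 s0*k3, using the identity kAdst4Multiplier[0] +
  // kAdst4Multiplier[1] == kAdst4Multiplier[3] (1321 + 2482 == 3803).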
  const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
  const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i c = _mm_packs_epi32(b, b);
  StoreLo8(dst, c);

  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  int i = 0;
  do {
    const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
    const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
    const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
    const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
    const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
    const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
    const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
    const __m128i x0 = s0;
    const __m128i x1 = s1;
    const __m128i x2 = s2;
    const __m128i x3 = _mm_add_epi32(s0, s1);
    const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
    const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
    const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
    const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
    const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
    const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
    StoreLo8(&dst[i], dst_0_1);
    StoreHi8(&dst[i + width * 1], dst_0_1);
    StoreLo8(&dst[i + width * 2], dst_2_3);
    StoreHi8(&dst[i + width * 3], dst_2_3);
    i += 4;
  } while (i < width);

  return true;
}

template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
                                        bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[8], x[8];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(dst, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(dst, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(dst, step, 0, x);
    }
  }

  // stage 1.
  s[0] = x[7];
  s[1] = x[0];
  s[2] = x[5];
  s[3] = x[2];
  s[4] = x[3];
  s[5] = x[4];
  s[6] = x[1];
  s[7] = x[6];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
  butterfly_rotation(&s[6], &s[7], 60 - 48, true);

  // stage 3.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[3], &s[7], false);

  // stage 4.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);

  // stage 6.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);

  // stage 7.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(x, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, x);
    }
  }
}

LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  __m128i s[8];

  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
  const __m128i v_mask =
      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  // stage 1.
  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);

  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

  // stage 3.
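  // With only s[0] and s[1] nonzero, the HadamardRotation calls of the full
  // Adst8 reduce to plain copies here and in stage 5.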
  s[4] = s[0];
  s[5] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[4], &s[5], 48, true);

  // stage 5.
  s[2] = s[0];
  s[3] = s[1];
  s[6] = s[4];
  s[7] = s[5];

  // stage 6.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);

  // stage 7.
  __m128i x[8];
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
  const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
  const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
  const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
  const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
  const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);

  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
  const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
  StoreUnaligned16(dst, _mm_packs_epi32(b, b1));

  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  __m128i s[8];

  int i = 0;
  do {
    const __m128i v_src = LoadLo8(dst);
    // stage 1.
    s[1] = v_src;

    // stage 2.
    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

    // stage 3.
    s[4] = s[0];
    s[5] = s[1];

    // stage 4.
    ButterflyRotation_4(&s[4], &s[5], 48, true);

    // stage 5.
    s[2] = s[0];
    s[3] = s[1];
    s[6] = s[4];
    s[7] = s[5];

    // stage 6.
    ButterflyRotation_4(&s[2], &s[3], 32, true);
    ButterflyRotation_4(&s[6], &s[7], 32, true);

    // stage 7.
    __m128i x[8];
    const __m128i v_zero = _mm_setzero_si128();
    x[0] = s[0];
    x[1] = _mm_subs_epi16(v_zero, s[4]);
    x[2] = s[6];
    x[3] = _mm_subs_epi16(v_zero, s[2]);
    x[4] = s[3];
    x[5] = _mm_subs_epi16(v_zero, s[7]);
    x[6] = s[5];
    x[7] = _mm_subs_epi16(v_zero, s[1]);

    for (int j = 0; j < 8; ++j) {
      StoreLo8(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}

template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
                                         bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  __m128i s[16], x[16];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(dst, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(dst, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(dst, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(dst, step, 0, x);
    }
  }

  // stage 1.
  s[0] = x[15];
  s[1] = x[0];
  s[2] = x[13];
  s[3] = x[2];
  s[4] = x[11];
  s[5] = x[4];
  s[6] = x[9];
  s[7] = x[6];
  s[8] = x[7];
  s[9] = x[8];
  s[10] = x[5];
  s[11] = x[10];
  s[12] = x[3];
  s[13] = x[12];
  s[14] = x[1];
  s[15] = x[14];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
  butterfly_rotation(&s[14], &s[15], 62 - 56, true);

  // stage 3.
  HadamardRotation(&s[0], &s[8], false);
  HadamardRotation(&s[1], &s[9], false);
  HadamardRotation(&s[2], &s[10], false);
  HadamardRotation(&s[3], &s[11], false);
  HadamardRotation(&s[4], &s[12], false);
  HadamardRotation(&s[5], &s[13], false);
  HadamardRotation(&s[6], &s[14], false);
  HadamardRotation(&s[7], &s[15], false);

  // stage 4.
  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
  butterfly_rotation(&s[15], &s[14], 8 + 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[8], &s[12], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[9], &s[13], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[10], &s[14], false);
  HadamardRotation(&s[3], &s[7], false);
  HadamardRotation(&s[11], &s[15], false);

  // stage 6.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
  butterfly_rotation(&s[15], &s[14], 48 - 32, true);

  // stage 7.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[8], &s[10], false);
  HadamardRotation(&s[12], &s[14], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);
  HadamardRotation(&s[9], &s[11], false);
  HadamardRotation(&s[13], &s[15], false);

  // stage 8.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);
  butterfly_rotation(&s[10], &s[11], 32, true);
  butterfly_rotation(&s[14], &s[15], 32, true);

  // stage 9.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&x[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&x[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, x);
    }
  }
}

LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);

  // stage 3.
  s[8] = s[0];
  s[9] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[8], &s[9], 56, true);

  // stage 5.
  s[4] = s[0];
  s[12] = s[8];
  s[5] = s[1];
  s[13] = s[9];

  // stage 6.
  ButterflyRotation_4(&s[4], &s[5], 48, true);
  ButterflyRotation_4(&s[12], &s[13], 48, true);

  // stage 7.
  s[2] = s[0];
  s[6] = s[4];
  s[10] = s[8];
  s[14] = s[12];
  s[3] = s[1];
  s[7] = s[5];
  s[11] = s[9];
  s[15] = s[13];

  // stage 8.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);
  ButterflyRotation_4(&s[10], &s[11], 32, true);
  ButterflyRotation_4(&s[14], &s[15], 32, true);

  // stage 9.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);
}

LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  __m128i s[16];
  __m128i x[16];

  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
  const __m128i v_mask =
      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  // stage 1.
  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);

  Adst16DcOnlyInternal(s, x);

  for (int i = 0; i < 2; ++i) {
    const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
    const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
    const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
    const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
    const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
    const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);

    const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
    const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
    const __m128i a =
        _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
    const __m128i a1 =
        _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
    const __m128i b = _mm_sra_epi32(a, v_row_shift);
    const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
    StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
  }
  return true;
}

LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
                                              int adjusted_tx_height,
                                              int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  int i = 0;
  do {
    __m128i s[16];
    __m128i x[16];
    const __m128i v_src = LoadUnaligned16(dst);
    // stage 1.
    s[1] = v_src;

    Adst16DcOnlyInternal(s, x);

    for (int j = 0; j < 16; ++j) {
      StoreLo8(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}

//------------------------------------------------------------------------------
// Identity Transforms.

template <bool is_row_shift>
LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
  auto* const dst = static_cast<int16_t*>(dest);

  if (is_row_shift) {
    const int shift = 1;
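    // v_dual_round folds both rounding terms of the fused (12 + shift) right
    // shift into one constant: (1 + (shift << 1)) << 11 is the >> 12 rounding
    // bias (1 << 11) plus shift << 12, and shift equals (1 << shift) >> 1 for
    // shift in [0, 2]. Interleaving it against the 0x0001 halves of
    // v_multiplier_one lets a single _mm_madd_epi16 compute
    // round + src * kIdentity4Multiplier.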
    const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
    const __m128i v_multiplier_one =
        _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
    for (int i = 0; i < 4; i += 2) {
      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
      const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
      const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
      const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
      const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
      const __m128i b = _mm_srai_epi32(a, 12 + shift);
      const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
      StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
    }
  } else {
    const __m128i v_multiplier =
        _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
    for (int i = 0; i < 4; i += 2) {
      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
      const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
      const __m128i b = _mm_adds_epi16(a, v_src);
      StoreUnaligned16(&dst[i * step], b);
    }
  }
}

LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
                                           bool should_round, int tx_height) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int16_t*>(dest);
  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
  const __m128i v_mask =
      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);

  const int shift = (tx_height < 16) ? 0 : 1;
  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
  const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
  const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
  const __m128i b = _mm_srai_epi32(a, 12 + shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}

LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;

  const __m128i v_multiplier_fraction =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
  const __m128i v_eight = _mm_set1_epi16(8);
1648
1649 if (tx_width == 4) {
1650 int i = 0;
1651 do {
1652 const __m128i v_src = LoadLo8(&source[i * tx_width]);
1653 const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1654 const __m128i frame_data = Load4(dst);
1655 const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
1656 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1657 const __m128i b = _mm_srai_epi16(a, 4);
1658 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1659 const __m128i d = _mm_adds_epi16(c, b);
1660 Store4(dst, _mm_packus_epi16(d, d));
1661 dst += stride;
1662 } while (++i < tx_height);
1663 } else {
1664 int i = 0;
1665 do {
1666 const int row = i * tx_width;
1667 int j = 0;
1668 do {
1669 const __m128i v_src = LoadUnaligned16(&source[row + j]);
1670 const __m128i v_src_mult =
1671 _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1672 const __m128i frame_data = LoadLo8(dst + j);
1673 const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
1674 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1675 const __m128i b = _mm_srai_epi16(a, 4);
1676 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1677 const __m128i d = _mm_adds_epi16(c, b);
1678 StoreLo8(dst + j, _mm_packus_epi16(d, d));
1679 j += 8;
1680 } while (j < tx_width);
1681 dst += stride;
1682 } while (++i < tx_height);
1683 }
1684 }
1685
1686 LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
1687 Array2DView<uint8_t> frame, const int start_x, const int start_y,
1688 const int tx_width, const int tx_height, const int16_t* source) {
1689 const int stride = frame.columns();
1690 uint8_t* dst = frame[start_y] + start_x;
1691
1692 const __m128i v_multiplier_fraction =
1693 _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
1694 const __m128i v_eight = _mm_set1_epi16(8);
1695 const __m128i v_kTransformRowMultiplier =
1696 _mm_set1_epi16(kTransformRowMultiplier << 3);
1697
1698 if (tx_width == 4) {
1699 int i = 0;
1700 do {
1701 const __m128i v_src = LoadLo8(&source[i * tx_width]);
1702 const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1703 const __m128i frame_data = Load4(dst);
1704 const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
1705 const __m128i v_src_mult2 =
1706 _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
1707 const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
1708 const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
1709 const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
1710 const __m128i b = _mm_srai_epi16(a, 4);
1711 const __m128i c = _mm_adds_epi16(frame_data16, b);
1712 Store4(dst, _mm_packus_epi16(c, c));
1713 dst += stride;
1714 } while (++i < tx_height);
1715 } else {
1716 int i = 0;
1717 do {
1718 const int row = i * tx_width;
1719 int j = 0;
1720 do {
1721 const __m128i v_src = LoadUnaligned16(&source[row + j]);
1722 const __m128i v_src_round =
1723 _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1724 const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
1725 const __m128i v_src_mult2 =
1726 _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
1727 const __m128i frame_data = LoadLo8(dst + j);
1728 const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
1729 const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
1730 const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
1731 const __m128i b = _mm_srai_epi16(a, 4);
1732 const __m128i c = _mm_adds_epi16(frame_data16, b);
1733 StoreLo8(dst + j, _mm_packus_epi16(c, c));
1734 j += 8;
1735 } while (j < tx_width);
1736 dst += stride;
1737 } while (++i < tx_height);
1738 }
1739 }
1740
1741 LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
1742 auto* const dst = static_cast<int16_t*>(dest);
1743
1744 // When combining the identity8 multiplier with the row shift, the
1745 // calculations for tx_height equal to 32 can be simplified from
1746 // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
1747 const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
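// _mm_mulhrs_epi16(a, 1 << 14) evaluates to ((a << 14) + (1 << 14)) >> 15,
// which is exactly (a + 1) >> 1.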
1748 for (int h = 0; h < 4; ++h) {
1749 const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1750 const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
1751 StoreUnaligned16(&dst[h * step], v_src_mult);
1752 }
1753 }
1754
1755 LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
1756 auto* const dst = static_cast<int16_t*>(dest);
1757
1758 for (int h = 0; h < 4; ++h) {
1759 const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1760 // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
1761 // saturating add here is ok.
1762 const __m128i a = _mm_adds_epi16(v_src, v_src);
1763 StoreUnaligned16(&dst[h * step], a);
1764 }
1765 }
1766
1767 LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
1768 bool should_round, int row_shift) {
1769 if (adjusted_tx_height > 1) return false;
1770
1771 auto* dst = static_cast<int16_t*>(dest);
1772 const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1773 const __m128i v_mask =
1774 _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1775 const __m128i v_kTransformRowMultiplier =
1776 _mm_set1_epi16(kTransformRowMultiplier << 3);
1777 const __m128i v_src_round =
1778 _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1779 const __m128i v_src =
1780 _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
1781 const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
1782 const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1783 const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
1784 const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
1785 const __m128i b = _mm_sra_epi32(a, v_row_shift);
1786 dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1787 return true;
1788 }
1789
1790 LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
1791 Array2DView<uint8_t> frame, const int start_x, const int start_y,
1792 const int tx_width, const int tx_height, const int16_t* source) {
1793 const int stride = frame.columns();
1794 uint8_t* dst = frame[start_y] + start_x;
1795 const __m128i v_eight = _mm_set1_epi16(8);
1796 if (tx_width == 4) {
1797 int i = 0;
1798 do {
1799 const int row = i * tx_width;
1800 const __m128i v_src = LoadLo8(&source[row]);
1801 const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1802 const __m128i frame_data = Load4(dst);
1803 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1804 const __m128i b = _mm_srai_epi16(a, 4);
1805 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1806 const __m128i d = _mm_adds_epi16(c, b);
1807 Store4(dst, _mm_packus_epi16(d, d));
1808 dst += stride;
1809 } while (++i < tx_height);
1810 } else {
1811 int i = 0;
1812 do {
1813 const int row = i * tx_width;
1814 int j = 0;
1815 do {
1816 const __m128i v_src = LoadUnaligned16(&source[row + j]);
1817 const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1818 const __m128i frame_data = LoadLo8(dst + j);
1819 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1820 const __m128i b = _mm_srai_epi16(a, 4);
1821 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1822 const __m128i d = _mm_adds_epi16(c, b);
1823 StoreLo8(dst + j, _mm_packus_epi16(d, d));
1824 j += 8;
1825 } while (j < tx_width);
1826 dst += stride;
1827 } while (++i < tx_height);
1828 }
1829 }
1830
1831 LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
1832 int shift) {
1833 auto* const dst = static_cast<int16_t*>(dest);
1834
1835 const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1836 const __m128i v_multiplier_one =
1837 _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
1838 const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
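// Same dual-round trick as Identity4_SSE4_1: each 32-bit lane of the madd
// computes round * 1 + src * kIdentity16Multiplier, assuming
// kIdentity16Multiplier == 11586 (2 * 5793, i.e. 2 * sqrt(2) in Q12, per
// inverse_transform.inc), followed by an arithmetic shift by 12 + shift.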
1839
1840 for (int h = 0; h < 4; ++h) {
1841 const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1842 const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
1843 const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
1844 const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
1845 const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
1846 const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
1847 const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
1848 const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
1849 const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
1850 const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
1851 const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
1852 const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
1853 const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
1854 const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
1855 StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
1856 StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
1857 }
1858 }
1859
1860 LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
1861 bool should_round, int shift) {
1862 if (adjusted_tx_height > 1) return false;
1863
1864 auto* dst = static_cast<int16_t*>(dest);
1865 const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1866 const __m128i v_mask =
1867 _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1868 const __m128i v_kTransformRowMultiplier =
1869 _mm_set1_epi16(kTransformRowMultiplier << 3);
1870 const __m128i v_src_round0 =
1871 _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1872 const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
1873 const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1874 const __m128i v_multiplier_one =
1875 _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
1876 const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
1877 const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
1878 const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
1879 const __m128i b = _mm_sra_epi32(a, v_shift);
1880 dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1881 return true;
1882 }
1883
1884 LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
1885 Array2DView<uint8_t> frame, const int start_x, const int start_y,
1886 const int tx_width, const int tx_height, const int16_t* source) {
1887 const int stride = frame.columns();
1888 uint8_t* dst = frame[start_y] + start_x;
1889 const __m128i v_eight = _mm_set1_epi16(8);
1890 const __m128i v_multiplier =
1891 _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
1892
1893 if (tx_width == 4) {
1894 int i = 0;
1895 do {
1896 const __m128i v_src = LoadLo8(&source[i * tx_width]);
1897 const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
1898 const __m128i frame_data = Load4(dst);
1899 const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
1900 const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
1901 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1902 const __m128i b = _mm_srai_epi16(a, 4);
1903 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1904 const __m128i d = _mm_adds_epi16(c, b);
1905 Store4(dst, _mm_packus_epi16(d, d));
1906 dst += stride;
1907 } while (++i < tx_height);
1908 } else {
1909 int i = 0;
1910 do {
1911 const int row = i * tx_width;
1912 int j = 0;
1913 do {
1914 const __m128i v_src = LoadUnaligned16(&source[row + j]);
1915 const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
1916 const __m128i frame_data = LoadLo8(dst + j);
1917 const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
1918 const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
1919 const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1920 const __m128i b = _mm_srai_epi16(a, 4);
1921 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1922 const __m128i d = _mm_adds_epi16(c, b);
1923 StoreLo8(dst + j, _mm_packus_epi16(d, d));
1924 j += 8;
1925 } while (j < tx_width);
1926 dst += stride;
1927 } while (++i < tx_height);
1928 }
1929 }
1930
1931 LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
1932 const int32_t step) {
1933 auto* const dst = static_cast<int16_t*>(dest);
1934
1935 // When combining the identity32 multiplier with the row shift, the
1936 // calculation for tx_height equal to 16 can be simplified from
1937 // (((A * 4) + 1) >> 1) to (A * 2).
1938 for (int h = 0; h < 4; ++h) {
1939 for (int i = 0; i < 32; i += 8) {
1940 const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
1941 // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
1942 // saturating add here is ok.
1943 const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1944 StoreUnaligned16(&dst[h * step + i], v_dst_i);
1945 }
1946 }
1947 }
1948
1949 LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
1950 int adjusted_tx_height) {
1951 if (adjusted_tx_height > 1) return false;
1952
1953 auto* dst = static_cast<int16_t*>(dest);
1954 const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1955 const __m128i v_kTransformRowMultiplier =
1956 _mm_set1_epi16(kTransformRowMultiplier << 3);
1957 const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1958
1959 // When combining the identity32 multiplier with the row shift, the
1960 // calculation for tx_height equal to 16 can be simplified from
1961 // (((A * 4) + 1) >> 1) to (A * 2).
1962 const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
1963 dst[0] = _mm_extract_epi16(v_dst_0, 0);
1964 return true;
1965 }
1966
1967 LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
1968 Array2DView<uint8_t> frame, const int start_x, const int start_y,
1969 const int tx_width, const int tx_height, const int16_t* source) {
1970 const int stride = frame.columns();
1971 uint8_t* dst = frame[start_y] + start_x;
1972 const __m128i v_two = _mm_set1_epi16(2);
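// The identity32 column pass reduces to a rounded shift: each residual is
// mapped to (x + 2) >> 2 before being added to the frame data.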
1973
1974 int i = 0;
1975 do {
1976 const int row = i * tx_width;
1977 int j = 0;
1978 do {
1979 const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
1980 const __m128i frame_data = LoadLo8(dst + j);
1981 const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
1982 const __m128i b = _mm_srai_epi16(a, 2);
1983 const __m128i c = _mm_cvtepu8_epi16(frame_data);
1984 const __m128i d = _mm_adds_epi16(c, b);
1985 StoreLo8(dst + j, _mm_packus_epi16(d, d));
1986 j += 8;
1987 } while (j < tx_width);
1988 dst += stride;
1989 } while (++i < tx_height);
1990 }
1991
1992 //------------------------------------------------------------------------------
1993 // Walsh Hadamard Transform.
1994
1995 // Process 4 wht4 rows and columns.
1996 LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
1997 const int start_x, const int start_y,
1998 const void* source,
1999 const int adjusted_tx_height) {
2000 const auto* const src = static_cast<const int16_t*>(source);
2001 __m128i s[4], x[4];
2002
2003 if (adjusted_tx_height == 1) {
2004 // Special case: only src[0] is nonzero.
2005 // src[0] 0 0 0
2006 // 0 0 0 0
2007 // 0 0 0 0
2008 // 0 0 0 0
2009 //
2010 // After the row and column transforms are applied, we have:
2011 // f h h h
2012 // g i i i
2013 // g i i i
2014 // g i i i
2015 // where f, g, h, i are computed as follows.
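// Derivation: a 1-D inverse WHT of (v, 0, 0, 0) produces v - (v >> 1) as
// its first output and v >> 1 for the remaining three. Applying this to
// the rows with v = src[0] >> 2, and then to each column, gives the
// first-column pair f/g and the h/i values of the other columns.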
2016 int16_t f = (src[0] >> 2) - (src[0] >> 3);
2017 const int16_t g = f >> 1;
2018 f = f - (f >> 1);
2019 const int16_t h = (src[0] >> 3) - (src[0] >> 4);
2020 const int16_t i = (src[0] >> 4);
2021 s[0] = _mm_set1_epi16(h);
2022 s[0] = _mm_insert_epi16(s[0], f, 0);
2023 s[1] = _mm_set1_epi16(i);
2024 s[1] = _mm_insert_epi16(s[1], g, 0);
2025 s[2] = s[3] = s[1];
2026 } else {
2027 x[0] = LoadLo8(&src[0 * 4]);
2028 x[2] = LoadLo8(&src[1 * 4]);
2029 x[3] = LoadLo8(&src[2 * 4]);
2030 x[1] = LoadLo8(&src[3 * 4]);
2031
2032 // Row transforms.
2033 Transpose4x4_U16(x, x);
2034 s[0] = _mm_srai_epi16(x[0], 2);
2035 s[2] = _mm_srai_epi16(x[1], 2);
2036 s[3] = _mm_srai_epi16(x[2], 2);
2037 s[1] = _mm_srai_epi16(x[3], 2);
2038 s[0] = _mm_add_epi16(s[0], s[2]);
2039 s[3] = _mm_sub_epi16(s[3], s[1]);
2040 __m128i e = _mm_sub_epi16(s[0], s[3]);
2041 e = _mm_srai_epi16(e, 1);
2042 s[1] = _mm_sub_epi16(e, s[1]);
2043 s[2] = _mm_sub_epi16(e, s[2]);
2044 s[0] = _mm_sub_epi16(s[0], s[1]);
2045 s[3] = _mm_add_epi16(s[3], s[2]);
2046 Transpose4x4_U16(s, s);
2047
2048 // Column transforms.
2049 s[0] = _mm_add_epi16(s[0], s[2]);
2050 s[3] = _mm_sub_epi16(s[3], s[1]);
2051 e = _mm_sub_epi16(s[0], s[3]);
2052 e = _mm_srai_epi16(e, 1);
2053 s[1] = _mm_sub_epi16(e, s[1]);
2054 s[2] = _mm_sub_epi16(e, s[2]);
2055 s[0] = _mm_sub_epi16(s[0], s[1]);
2056 s[3] = _mm_add_epi16(s[3], s[2]);
2057 }
2058
2059 // Store to frame.
2060 const int stride = frame.columns();
2061 uint8_t* dst = frame[start_y] + start_x;
2062 for (int row = 0; row < 4; ++row) {
2063 const __m128i frame_data = Load4(dst);
2064 const __m128i a = _mm_cvtepu8_epi16(frame_data);
2065 // Saturate to prevent overflowing int16_t
2066 const __m128i b = _mm_adds_epi16(a, s[row]);
2067 Store4(dst, _mm_packus_epi16(b, b));
2068 dst += stride;
2069 }
2070 }
2071
2072 //------------------------------------------------------------------------------
2073 // row/column transform loops
2074
2075 template <bool enable_flip_rows = false>
2076 LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
2077 Array2DView<uint8_t> frame, const int start_x, const int start_y,
2078 const int tx_width, const int tx_height, const int16_t* source,
2079 TransformType tx_type) {
2080 const bool flip_rows =
2081 enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
2082 const __m128i v_eight = _mm_set1_epi16(8);
2083 const int stride = frame.columns();
2084 uint8_t* dst = frame[start_y] + start_x;
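// In every branch below the residual is rounded with (x + 8) >> 4, added to
// the frame data widened to 16 bits, and packed back to 8 bits with unsigned
// saturation, which performs the 0..255 clamp.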
2085 if (tx_width == 4) {
2086 for (int i = 0; i < tx_height; ++i) {
2087 const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
2088 const __m128i residual = LoadLo8(&source[row]);
2089 const __m128i frame_data = Load4(dst);
2090 // Saturate to prevent overflowing int16_t
2091 const __m128i a = _mm_adds_epi16(residual, v_eight);
2092 const __m128i b = _mm_srai_epi16(a, 4);
2093 const __m128i c = _mm_cvtepu8_epi16(frame_data);
2094 const __m128i d = _mm_adds_epi16(c, b);
2095 Store4(dst, _mm_packus_epi16(d, d));
2096 dst += stride;
2097 }
2098 } else if (tx_width == 8) {
2099 for (int i = 0; i < tx_height; ++i) {
2100 const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
2101 const __m128i residual = LoadUnaligned16(&source[row]);
2102 const __m128i frame_data = LoadLo8(dst);
2103 // Saturate to prevent overflowing int16_t
2104 const __m128i b = _mm_adds_epi16(residual, v_eight);
2105 const __m128i c = _mm_srai_epi16(b, 4);
2106 const __m128i d = _mm_cvtepu8_epi16(frame_data);
2107 const __m128i e = _mm_adds_epi16(d, c);
2108 StoreLo8(dst, _mm_packus_epi16(e, e));
2109 dst += stride;
2110 }
2111 } else {
2112 for (int i = 0; i < tx_height; ++i) {
2113 const int y = start_y + i;
2114 const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
2115 int j = 0;
2116 do {
2117 const int x = start_x + j;
2118 const __m128i residual = LoadUnaligned16(&source[row + j]);
2119 const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
2120 const __m128i frame_data = LoadUnaligned16(frame[y] + x);
2121 const __m128i b = _mm_adds_epi16(residual, v_eight);
2122 const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
2123 const __m128i c = _mm_srai_epi16(b, 4);
2124 const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
2125 const __m128i d = _mm_cvtepu8_epi16(frame_data);
2126 const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
2127 const __m128i e = _mm_adds_epi16(d, c);
2128 const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
2129 StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
2130 j += 16;
2131 } while (j < tx_width);
2132 }
2133 }
2134 }
2135
2136 template <int tx_height>
2137 LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
2138 const __m128i word_reverse_8 =
2139 _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
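// word_reverse_8 is a byte shuffle that reverses the eight 16-bit lanes of a
// register: output word k takes input bytes (14 - 2k, 15 - 2k), i.e. input
// word 7 - k.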
2140 if (tx_width >= 16) {
2141 int i = 0;
2142 do {
2143 // Read 16 shorts.
2144 const __m128i v3210 = LoadUnaligned16(&source[i]);
2145 const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
2146 const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
2147 const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
2148 StoreUnaligned16(&source[i], v4567);
2149 StoreUnaligned16(&source[i + 8], v0123);
2150 i += 16;
2151 } while (i < tx_width * tx_height);
2152 } else if (tx_width == 8) {
2153 for (int i = 0; i < 8 * tx_height; i += 8) {
2154 const __m128i a = LoadUnaligned16(&source[i]);
2155 const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
2156 StoreUnaligned16(&source[i], b);
2157 }
2158 } else {
2159 const __m128i dual_word_reverse_4 =
2160 _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
2161 // Process two rows per iteration.
2162 for (int i = 0; i < 4 * tx_height; i += 8) {
2163 const __m128i a = LoadUnaligned16(&source[i]);
2164 const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
2165 StoreUnaligned16(&source[i], b);
2166 }
2167 }
2168 }
2169
2170 template <int tx_width>
2171 LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
2172 const __m128i v_kTransformRowMultiplier =
2173 _mm_set1_epi16(kTransformRowMultiplier << 3);
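// kTransformRowMultiplier is assumed to be 2896 (round(2^12 / sqrt(2)), per
// inverse_transform.inc). With the << 3, _mm_mulhrs_epi16 computes
// (a * 2896 + (1 << 11)) >> 12, i.e. a / sqrt(2) with rounding.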
2174 if (tx_width == 4) {
2175 // Process two rows per iteration.
2176 int i = 0;
2177 do {
2178 const __m128i a = LoadUnaligned16(&source[i]);
2179 const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2180 StoreUnaligned16(&source[i], b);
2181 i += 8;
2182 } while (i < tx_width * num_rows);
2183 } else {
2184 int i = 0;
2185 do {
2186 // The last 32 values of every row are always zero if the |tx_width| is
2187 // 64.
2188 const int non_zero_width = (tx_width < 64) ? tx_width : 32;
2189 int j = 0;
2190 do {
2191 const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
2192 const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2193 StoreUnaligned16(&source[i * tx_width + j], b);
2194 j += 8;
2195 } while (j < non_zero_width);
2196 } while (++i < num_rows);
2197 }
2198 }
2199
2200 template <int tx_width>
2201 LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
2202 int row_shift) {
2203 const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
2204 const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
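// The broadcast row_shift value doubles as the rounding addend: the rounding
// term for a right shift by s is (1 << s) >> 1, which equals s itself for
// the row shifts of 1 and 2 used here.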
2205 if (tx_width == 4) {
2206 // Process two rows per iteration.
2207 int i = 0;
2208 do {
2209 const __m128i residual = LoadUnaligned16(&source[i]);
2210 const __m128i shifted_residual =
2211 ShiftResidual(residual, v_row_shift_add, v_row_shift);
2212 StoreUnaligned16(&source[i], shifted_residual);
2213 i += 8;
2214 } while (i < tx_width * num_rows);
2215 } else {
2216 int i = 0;
2217 do {
2218 for (int j = 0; j < tx_width; j += 8) {
2219 const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
2220 const __m128i shifted_residual =
2221 ShiftResidual(residual, v_row_shift_add, v_row_shift);
2222 StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
2223 }
2224 } while (++i < num_rows);
2225 }
2226 }
2227
2228 void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2229 TransformSize tx_size, int adjusted_tx_height,
2230 void* src_buffer, int /*start_x*/,
2231 int /*start_y*/, void* /*dst_frame*/) {
2232 auto* src = static_cast<int16_t*>(src_buffer);
2233 const int tx_height = kTransformHeight[tx_size];
2234 const bool should_round = (tx_height == 8);
2235 const int row_shift = static_cast<int>(tx_height == 16);
2236
2237 if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
2238 return;
2239 }
2240
2241 if (should_round) {
2242 ApplyRounding<4>(src, adjusted_tx_height);
2243 }
2244
2245 if (adjusted_tx_height <= 4) {
2246 // Process 4 1d dct4 rows in parallel.
2247 Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
2248 /*transpose=*/true);
2249 } else {
2250 // Process 8 1d dct4 rows in parallel per iteration.
2251 int i = 0;
2252 do {
2253 Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
2254 /*transpose=*/true);
2255 i += 8;
2256 } while (i < adjusted_tx_height);
2257 }
2258 if (tx_height == 16) {
2259 RowShift<4>(src, adjusted_tx_height, 1);
2260 }
2261 }
2262
2263 void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
2264 TransformSize tx_size,
2265 int adjusted_tx_height, void* src_buffer,
2266 int start_x, int start_y, void* dst_frame) {
2267 auto* src = static_cast<int16_t*>(src_buffer);
2268 const int tx_width = kTransformWidth[tx_size];
2269
2270 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2271 FlipColumns<4>(src, tx_width);
2272 }
2273
2274 if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
2275 if (tx_width == 4) {
2276 // Process 4 1d dct4 columns in parallel.
2277 Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
2278 /*transpose=*/false);
2279 } else {
2280 // Process 8 1d dct4 columns in parallel per iteration.
2281 int i = 0;
2282 do {
2283 Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
2284 /*transpose=*/false);
2285 i += 8;
2286 } while (i < tx_width);
2287 }
2288 }
2289 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2290 StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
2291 }
2292
2293 void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2294 TransformSize tx_size, int adjusted_tx_height,
2295 void* src_buffer, int /*start_x*/,
2296 int /*start_y*/, void* /*dst_frame*/) {
2297 auto* src = static_cast<int16_t*>(src_buffer);
2298 const bool should_round = kShouldRound[tx_size];
2299 const uint8_t row_shift = kTransformRowShift[tx_size];
2300
2301 if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
2302 return;
2303 }
2304
2305 if (should_round) {
2306 ApplyRounding<8>(src, adjusted_tx_height);
2307 }
2308
2309 if (adjusted_tx_height <= 4) {
2310 // Process 4 1d dct8 rows in parallel.
2311 Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
2312 } else {
2313 // Process 8 1d dct8 rows in parallel per iteration.
2314 int i = 0;
2315 do {
2316 Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
2317 /*transpose=*/true);
2318 i += 8;
2319 } while (i < adjusted_tx_height);
2320 }
2321 if (row_shift > 0) {
2322 RowShift<8>(src, adjusted_tx_height, row_shift);
2323 }
2324 }
2325
2326 void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
2327 TransformSize tx_size,
2328 int adjusted_tx_height, void* src_buffer,
2329 int start_x, int start_y, void* dst_frame) {
2330 auto* src = static_cast<int16_t*>(src_buffer);
2331 const int tx_width = kTransformWidth[tx_size];
2332
2333 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2334 FlipColumns<8>(src, tx_width);
2335 }
2336
2337 if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
2338 if (tx_width == 4) {
2339 // Process 4 1d dct8 columns in parallel.
2340 Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2341 } else {
2342 // Process 8 1d dct8 columns in parallel per iteration.
2343 int i = 0;
2344 do {
2345 Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2346 /*transpose=*/false);
2347 i += 8;
2348 } while (i < tx_width);
2349 }
2350 }
2351 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2352 StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
2353 }
2354
2355 void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2356 TransformSize tx_size, int adjusted_tx_height,
2357 void* src_buffer, int /*start_x*/,
2358 int /*start_y*/, void* /*dst_frame*/) {
2359 auto* src = static_cast<int16_t*>(src_buffer);
2360 const bool should_round = kShouldRound[tx_size];
2361 const uint8_t row_shift = kTransformRowShift[tx_size];
2362
2363 if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
2364 return;
2365 }
2366
2367 if (should_round) {
2368 ApplyRounding<16>(src, adjusted_tx_height);
2369 }
2370
2371 if (adjusted_tx_height <= 4) {
2372 // Process 4 1d dct16 rows in parallel.
2373 Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
2374 } else {
2375 int i = 0;
2376 do {
2377 // Process 8 1d dct16 rows in parallel per iteration.
2378 Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
2379 /*transpose=*/true);
2380 i += 8;
2381 } while (i < adjusted_tx_height);
2382 }
2383 // row_shift is always nonzero here.
2384 RowShift<16>(src, adjusted_tx_height, row_shift);
2385 }
2386
2387 void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
2388 TransformSize tx_size,
2389 int adjusted_tx_height, void* src_buffer,
2390 int start_x, int start_y,
2391 void* dst_frame) {
2392 auto* src = static_cast<int16_t*>(src_buffer);
2393 const int tx_width = kTransformWidth[tx_size];
2394
2395 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2396 FlipColumns<16>(src, tx_width);
2397 }
2398
2399 if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
2400 if (tx_width == 4) {
2401 // Process 4 1d dct16 columns in parallel.
2402 Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2403 } else {
2404 int i = 0;
2405 do {
2406 // Process 8 1d dct16 columns in parallel per iteration.
2407 Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2408 /*transpose=*/false);
2409 i += 8;
2410 } while (i < tx_width);
2411 }
2412 }
2413 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2414 StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
2415 }
2416
2417 void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2418 TransformSize tx_size, int adjusted_tx_height,
2419 void* src_buffer, int /*start_x*/,
2420 int /*start_y*/, void* /*dst_frame*/) {
2421 auto* src = static_cast<int16_t*>(src_buffer);
2422 const bool should_round = kShouldRound[tx_size];
2423 const uint8_t row_shift = kTransformRowShift[tx_size];
2424
2425 if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
2426 return;
2427 }
2428
2429 if (should_round) {
2430 ApplyRounding<32>(src, adjusted_tx_height);
2431 }
2432 // Process 8 1d dct32 rows in parallel per iteration.
2433 int i = 0;
2434 do {
2435 Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
2436 i += 8;
2437 } while (i < adjusted_tx_height);
2438 // row_shift is always nonzero here.
2439 RowShift<32>(src, adjusted_tx_height, row_shift);
2440 }
2441
2442 void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
2443 TransformSize tx_size,
2444 int adjusted_tx_height, void* src_buffer,
2445 int start_x, int start_y,
2446 void* dst_frame) {
2447 auto* src = static_cast<int16_t*>(src_buffer);
2448 const int tx_width = kTransformWidth[tx_size];
2449
2450 if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
2451 // Process 8 1d dct32 columns in parallel per iteration.
2452 int i = 0;
2453 do {
2454 Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
2455 i += 8;
2456 } while (i < tx_width);
2457 }
2458 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2459 StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
2460 }
2461
2462 void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2463 TransformSize tx_size, int adjusted_tx_height,
2464 void* src_buffer, int /*start_x*/,
2465 int /*start_y*/, void* /*dst_frame*/) {
2466 auto* src = static_cast<int16_t*>(src_buffer);
2467 const bool should_round = kShouldRound[tx_size];
2468 const uint8_t row_shift = kTransformRowShift[tx_size];
2469
2470 if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
2471 return;
2472 }
2473
2474 if (should_round) {
2475 ApplyRounding<64>(src, adjusted_tx_height);
2476 }
2477 // Process 8 1d dct64 rows in parallel per iteration.
2478 int i = 0;
2479 do {
2480 Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
2481 i += 8;
2482 } while (i < adjusted_tx_height);
2483 // row_shift is always nonzero here.
2484 RowShift<64>(src, adjusted_tx_height, row_shift);
2485 }
2486
2487 void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
2488 TransformSize tx_size,
2489 int adjusted_tx_height, void* src_buffer,
2490 int start_x, int start_y,
2491 void* dst_frame) {
2492 auto* src = static_cast<int16_t*>(src_buffer);
2493 const int tx_width = kTransformWidth[tx_size];
2494
2495 if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
2496 // Process 8 1d dct64 columns in parallel per iteration.
2497 int i = 0;
2498 do {
2499 Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
2500 i += 8;
2501 } while (i < tx_width);
2502 }
2503 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2504 StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
2505 }
2506
2507 void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2508 TransformSize tx_size, int adjusted_tx_height,
2509 void* src_buffer, int /*start_x*/,
2510 int /*start_y*/, void* /*dst_frame*/) {
2511 auto* src = static_cast<int16_t*>(src_buffer);
2512 const int tx_height = kTransformHeight[tx_size];
2513 const int row_shift = static_cast<int>(tx_height == 16);
2514 const bool should_round = (tx_height == 8);
2515
2516 if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2517 return;
2518 }
2519
2520 if (should_round) {
2521 ApplyRounding<4>(src, adjusted_tx_height);
2522 }
2523
2524 // Process 4 1d adst4 rows in parallel per iteration.
2525 int i = 0;
2526 do {
2527 Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
2528 i += 4;
2529 } while (i < adjusted_tx_height);
2530
2531 if (row_shift != 0) {
2532 RowShift<4>(src, adjusted_tx_height, 1);
2533 }
2534 }
2535
2536 void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
2537 TransformSize tx_size,
2538 int adjusted_tx_height, void* src_buffer,
2539 int start_x, int start_y,
2540 void* dst_frame) {
2541 auto* src = static_cast<int16_t*>(src_buffer);
2542 const int tx_width = kTransformWidth[tx_size];
2543
2544 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2545 FlipColumns<4>(src, tx_width);
2546 }
2547
2548 if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2549 // Process 4 1d adst4 columns in parallel per iteration.
2550 int i = 0;
2551 do {
2552 Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
2553 i += 4;
2554 } while (i < tx_width);
2555 }
2556 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2557 StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2558 tx_width, 4, src, tx_type);
2559 }
2560
2561 void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2562 TransformSize tx_size, int adjusted_tx_height,
2563 void* src_buffer, int /*start_x*/,
2564 int /*start_y*/, void* /*dst_frame*/) {
2565 auto* src = static_cast<int16_t*>(src_buffer);
2566 const bool should_round = kShouldRound[tx_size];
2567 const uint8_t row_shift = kTransformRowShift[tx_size];
2568
2569 if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2570 return;
2571 }
2572
2573 if (should_round) {
2574 ApplyRounding<8>(src, adjusted_tx_height);
2575 }
2576
2577 if (adjusted_tx_height <= 4) {
2578 // Process 4 1d adst8 rows in parallel.
2579 Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
2580 /*transpose=*/true);
2581 } else {
2582 // Process 8 1d adst8 rows in parallel per iteration.
2583 int i = 0;
2584 do {
2585 Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
2586 /*transpose=*/true);
2587 i += 8;
2588 } while (i < adjusted_tx_height);
2589 }
2590 if (row_shift > 0) {
2591 RowShift<8>(src, adjusted_tx_height, row_shift);
2592 }
2593 }
2594
2595 void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
2596 TransformSize tx_size,
2597 int adjusted_tx_height, void* src_buffer,
2598 int start_x, int start_y,
2599 void* dst_frame) {
2600 auto* src = static_cast<int16_t*>(src_buffer);
2601 const int tx_width = kTransformWidth[tx_size];
2602
2603 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2604 FlipColumns<8>(src, tx_width);
2605 }
2606
2607 if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2608 if (tx_width == 4) {
2609 // Process 4 1d adst8 columns in parallel.
2610 Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2611 } else {
2612 // Process 8 1d adst8 columns in parallel per iteration.
2613 int i = 0;
2614 do {
2615 Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2616 /*transpose=*/false);
2617 i += 8;
2618 } while (i < tx_width);
2619 }
2620 }
2621 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2622 StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2623 tx_width, 8, src, tx_type);
2624 }
2625
2626 void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2627 TransformSize tx_size,
2628 int adjusted_tx_height, void* src_buffer,
2629 int /*start_x*/, int /*start_y*/,
2630 void* /*dst_frame*/) {
2631 auto* src = static_cast<int16_t*>(src_buffer);
2632 const bool should_round = kShouldRound[tx_size];
2633 const uint8_t row_shift = kTransformRowShift[tx_size];
2634
2635 if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2636 return;
2637 }
2638
2639 if (should_round) {
2640 ApplyRounding<16>(src, adjusted_tx_height);
2641 }
2642
2643 if (adjusted_tx_height <= 4) {
2644 // Process 4 1d adst16 rows in parallel.
2645 Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
2646 } else {
2647 int i = 0;
2648 do {
2649 // Process 8 1d adst16 rows in parallel per iteration.
2650 Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
2651 /*transpose=*/true);
2652 i += 8;
2653 } while (i < adjusted_tx_height);
2654 }
2655 // row_shift is always nonzero here.
2656 RowShift<16>(src, adjusted_tx_height, row_shift);
2657 }
2658
2659 void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
2660 TransformSize tx_size,
2661 int adjusted_tx_height, void* src_buffer,
2662 int start_x, int start_y,
2663 void* dst_frame) {
2664 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2665 auto* src = static_cast<int16_t*>(src_buffer);
2666 const int tx_width = kTransformWidth[tx_size];
2667
2668 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2669 FlipColumns<16>(src, tx_width);
2670 }
2671
2672 if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2673 if (tx_width == 4) {
2674 // Process 4 1d adst16 columns in parallel.
2675 Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2676 } else {
2677 int i = 0;
2678 do {
2679 // Process 8 1d adst16 columns in parallel per iteration.
2680 Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2681 /*transpose=*/false);
2682 i += 8;
2683 } while (i < tx_width);
2684 }
2685 }
2686 StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2687 tx_width, 16, src, tx_type);
2688 }
2689
2690 void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
2691 TransformSize tx_size,
2692 int adjusted_tx_height, void* src_buffer,
2693 int /*start_x*/, int /*start_y*/,
2694 void* /*dst_frame*/) {
2695 // Special case: process the row calculations during the column transform
2696 // call. This improves performance.
2697 if (tx_type == kTransformTypeIdentityIdentity &&
2698 tx_size == kTransformSize4x4) {
2699 return;
2700 }
2701
2702 auto* src = static_cast<int16_t*>(src_buffer);
2703 const int tx_height = kTransformHeight[tx_size];
2704 const bool should_round = (tx_height == 8);
2705 if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
2706 return;
2707 }
2708
2709 if (should_round) {
2710 ApplyRounding<4>(src, adjusted_tx_height);
2711 }
2712 if (tx_height < 16) {
2713 int i = 0;
2714 do {
2715 Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
2716 i += 4;
2717 } while (i < adjusted_tx_height);
2718 } else {
2719 int i = 0;
2720 do {
2721 Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
2722 i += 4;
2723 } while (i < adjusted_tx_height);
2724 }
2725 }
2726
2727 void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
2728 TransformSize tx_size,
2729 int adjusted_tx_height,
2730 void* src_buffer, int start_x,
2731 int start_y, void* dst_frame) {
2732 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2733 auto* src = static_cast<int16_t*>(src_buffer);
2734 const int tx_width = kTransformWidth[tx_size];
2735
2736 // Special case: process the row calculations during the column transform call.
2737 if (tx_type == kTransformTypeIdentityIdentity &&
2738 (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
2739 Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
2740 adjusted_tx_height, src);
2741 return;
2742 }
2743
2744 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2745 FlipColumns<4>(src, tx_width);
2746 }
2747
2748 Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
2749 adjusted_tx_height, src);
2750 }
2751
2752 void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
2753 TransformSize tx_size,
2754 int adjusted_tx_height, void* src_buffer,
2755 int /*start_x*/, int /*start_y*/,
2756 void* /*dst_frame*/) {
2757 // Special case: process the row calculations during the column transform
2758 // call. This improves performance.
2759 if (tx_type == kTransformTypeIdentityIdentity &&
2760 tx_size == kTransformSize8x4) {
2761 return;
2762 }
2763
2764 auto* src = static_cast<int16_t*>(src_buffer);
2765 const int tx_height = kTransformHeight[tx_size];
2766 const bool should_round = kShouldRound[tx_size];
2767 const uint8_t row_shift = kTransformRowShift[tx_size];
2768 if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2769 return;
2770 }
2771
2772 if (should_round) {
2773 ApplyRounding<8>(src, adjusted_tx_height);
2774 }
2775
2776 // When combining the identity8 multiplier with the row shift, the
2777 // calculations for tx_height == 8 and tx_height == 16 can be simplified
2778 // from (((A * 2) + 1) >> 1) to A.
2779 if ((tx_height & 0x18) != 0) {
2780 return;
2781 }
2782 if (tx_height == 32) {
2783 int i = 0;
2784 do {
2785 Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
2786 i += 4;
2787 } while (i < adjusted_tx_height);
2788 return;
2789 }
2790
2791 assert(tx_size == kTransformSize8x4);
2792 int i = 0;
2793 do {
2794 Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
2795 i += 4;
2796 } while (i < adjusted_tx_height);
2797 }
2798
2799 void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
2800 TransformSize tx_size,
2801 int adjusted_tx_height,
2802 void* src_buffer, int start_x,
2803 int start_y, void* dst_frame) {
2804 auto* src = static_cast<int16_t*>(src_buffer);
2805 const int tx_width = kTransformWidth[tx_size];
2806
2807 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2808 FlipColumns<8>(src, tx_width);
2809 }
2810
2811 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2812 Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
2813 adjusted_tx_height, src);
2814 }
2815
2816 void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2817 TransformSize tx_size,
2818 int adjusted_tx_height, void* src_buffer,
2819 int /*start_x*/, int /*start_y*/,
2820 void* /*dst_frame*/) {
2821 auto* src = static_cast<int16_t*>(src_buffer);
2822 const bool should_round = kShouldRound[tx_size];
2823 const uint8_t row_shift = kTransformRowShift[tx_size];
2824 if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2825 return;
2826 }
2827
2828 if (should_round) {
2829 ApplyRounding<16>(src, adjusted_tx_height);
2830 }
2831 int i = 0;
2832 do {
2833 Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
2834 kTransformRowShift[tx_size]);
2835 i += 4;
2836 } while (i < adjusted_tx_height);
2837 }
2838
2839 void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
2840 TransformSize tx_size,
2841 int adjusted_tx_height,
2842 void* src_buffer, int start_x,
2843 int start_y, void* dst_frame) {
2844 auto* src = static_cast<int16_t*>(src_buffer);
2845 const int tx_width = kTransformWidth[tx_size];
2846
2847 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2848 FlipColumns<16>(src, tx_width);
2849 }
2850 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2851 Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
2852 adjusted_tx_height, src);
2853 }
2854
2855 void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2856 TransformSize tx_size,
2857 int adjusted_tx_height, void* src_buffer,
2858 int /*start_x*/, int /*start_y*/,
2859 void* /*dst_frame*/) {
2860 const int tx_height = kTransformHeight[tx_size];
2861 // When combining the identity32 multiplier with the row shift, the
2862 // calculations for tx_height == 8 and tx_height == 32 can be simplified
2863 // from (((A * 4) + 2) >> 2) to A.
2864 if ((tx_height & 0x28) != 0) {
2865 return;
2866 }
2867
2868 // Process kTransformSize32x16. The src is always rounded before the
2869 // identity transform and shifted by 1 afterwards.
2870 auto* src = static_cast<int16_t*>(src_buffer);
2871 if (Identity32DcOnly(src, adjusted_tx_height)) {
2872 return;
2873 }
2874
2875 assert(tx_size == kTransformSize32x16);
2876 ApplyRounding<32>(src, adjusted_tx_height);
2877 int i = 0;
2878 do {
2879 Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
2880 i += 4;
2881 } while (i < adjusted_tx_height);
2882 }
2883
2884 void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
2885 TransformSize tx_size,
2886 int adjusted_tx_height,
2887 void* src_buffer, int start_x,
2888 int start_y, void* dst_frame) {
2889 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2890 auto* src = static_cast<int16_t*>(src_buffer);
2891 const int tx_width = kTransformWidth[tx_size];
2892
2893 Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
2894 adjusted_tx_height, src);
2895 }
2896
2897 void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
2898 int /*adjusted_tx_height*/,
2899 void* /*src_buffer*/, int /*start_x*/,
2900 int /*start_y*/, void* /*dst_frame*/) {
2901 assert(tx_type == kTransformTypeDctDct);
2902 assert(tx_size == kTransformSize4x4);
2903 static_cast<void>(tx_type);
2904 static_cast<void>(tx_size);
2905 // Do both row and column transforms in the column-transform pass.
2906 }
2907
2908 void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
2909 TransformSize tx_size,
2910 int adjusted_tx_height, void* src_buffer,
2911 int start_x, int start_y, void* dst_frame) {
2912 assert(tx_type == kTransformTypeDctDct);
2913 assert(tx_size == kTransformSize4x4);
2914 static_cast<void>(tx_type);
2915 static_cast<void>(tx_size);
2916
2917 // Do both row and column transforms in the column-transform pass.
2918 // Process 4 1d wht4 rows and columns in parallel.
2919 const auto* src = static_cast<int16_t*>(src_buffer);
2920 auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2921 Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
2922 }
2923
2924 //------------------------------------------------------------------------------
2925
2926 void Init8bpp() {
2927 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
2928 assert(dsp != nullptr);
2929
2930 // Maximum transform size for Dct is 64.
2931 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
2932 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
2933 Dct4TransformLoopRow_SSE4_1;
2934 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
2935 Dct4TransformLoopColumn_SSE4_1;
2936 #endif
2937 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
2938 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
2939 Dct8TransformLoopRow_SSE4_1;
2940 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
2941 Dct8TransformLoopColumn_SSE4_1;
2942 #endif
2943 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
2944 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
2945 Dct16TransformLoopRow_SSE4_1;
2946 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
2947 Dct16TransformLoopColumn_SSE4_1;
2948 #endif
2949 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
2950 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
2951 Dct32TransformLoopRow_SSE4_1;
2952 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
2953 Dct32TransformLoopColumn_SSE4_1;
2954 #endif
2955 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
2956 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
2957 Dct64TransformLoopRow_SSE4_1;
2958 dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
2959 Dct64TransformLoopColumn_SSE4_1;
2960 #endif
2961
2962 // Maximum transform size for Adst is 16.
2963 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
2964 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
2965 Adst4TransformLoopRow_SSE4_1;
2966 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
2967 Adst4TransformLoopColumn_SSE4_1;
2968 #endif
2969 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
2970 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
2971 Adst8TransformLoopRow_SSE4_1;
2972 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
2973 Adst8TransformLoopColumn_SSE4_1;
2974 #endif
2975 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
2976 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
2977 Adst16TransformLoopRow_SSE4_1;
2978 dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
2979 Adst16TransformLoopColumn_SSE4_1;
2980 #endif
2981
2982 // Maximum transform size for Identity transform is 32.
2983 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
2984 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
2985 Identity4TransformLoopRow_SSE4_1;
2986 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
2987 Identity4TransformLoopColumn_SSE4_1;
2988 #endif
2989 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
2990 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
2991 Identity8TransformLoopRow_SSE4_1;
2992 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
2993 Identity8TransformLoopColumn_SSE4_1;
2994 #endif
2995 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
2996 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
2997 Identity16TransformLoopRow_SSE4_1;
2998 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
2999 Identity16TransformLoopColumn_SSE4_1;
3000 #endif
3001 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
3002 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
3003 Identity32TransformLoopRow_SSE4_1;
3004 dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
3005 Identity32TransformLoopColumn_SSE4_1;
3006 #endif
3007
3008 // Maximum transform size for Wht is 4.
3009 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
3010 dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
3011 Wht4TransformLoopRow_SSE4_1;
3012 dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
3013 Wht4TransformLoopColumn_SSE4_1;
3014 #endif
3015 }
3016
3017 } // namespace
3018 } // namespace low_bitdepth
3019
3020 void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
3021
3022 } // namespace dsp
3023 } // namespace libgav1
3024 #else // !LIBGAV1_TARGETING_SSE4_1
3025 namespace libgav1 {
3026 namespace dsp {
3027
3028 void InverseTransformInit_SSE4_1() {}
3029
3030 } // namespace dsp
3031 } // namespace libgav1
3032 #endif // LIBGAV1_TARGETING_SSE4_1
3033