1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/inverse_transform.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <smmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstdint>
25 #include <cstring>
26 
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/dsp/x86/transpose_sse4.h"
31 #include "src/utils/array_2d.h"
32 #include "src/utils/common.h"
33 #include "src/utils/compiler_attributes.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace low_bitdepth {
38 namespace {
39 
40 // Include the constants and utility functions inside the anonymous namespace.
41 #include "src/dsp/inverse_transform.inc"
42 
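// StoreDst and LoadSrc copy |store_count|/|load_count| rows of coefficients
// between memory and SSE registers. The width template parameter is in bytes:
// 16 moves a full __m128i (eight int16_t values) per row, 8 moves only the
// low half (four values).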
43 template <int store_width, int store_count>
44 LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
45                                     const __m128i* s) {
46   // NOTE: It is expected that the compiler will unroll these loops.
47   if (store_width == 16) {
48     for (int i = 0; i < store_count; i += 4) {
49       StoreUnaligned16(&dst[i * stride + idx], s[i]);
50       StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
51       StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
52       StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
53     }
54   }
55   if (store_width == 8) {
56     for (int i = 0; i < store_count; i += 4) {
57       StoreLo8(&dst[i * stride + idx], s[i]);
58       StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
59       StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
60       StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
61     }
62   }
63 }
64 
65 template <int load_width, int load_count>
66 LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
67                                    int32_t idx, __m128i* x) {
68   // NOTE: It is expected that the compiler will unroll these loops.
69   if (load_width == 16) {
70     for (int i = 0; i < load_count; i += 4) {
71       x[i] = LoadUnaligned16(&src[i * stride + idx]);
72       x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
73       x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
74       x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
75     }
76   }
77   if (load_width == 8) {
78     for (int i = 0; i < load_count; i += 4) {
79       x[i] = LoadLo8(&src[i * stride + idx]);
80       x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
81       x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
82       x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
83     }
84   }
85 }
86 
87 // Butterfly rotate 4 values.
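// Computes x = a * cos(angle * pi / 128) - b * sin(angle * pi / 128) and
// y = a * sin(angle * pi / 128) + b * cos(angle * pi / 128) using the Q12
// values returned by Cos128()/Sin128(), then writes x to |a| and y to |b|
// (swapped when |flip| is set).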
88 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
89                                                const int angle,
90                                                const bool flip) {
91   const int16_t cos128 = Cos128(angle);
92   const int16_t sin128 = Sin128(angle);
93   const __m128i psin_pcos = _mm_set1_epi32(
94       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
95   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
96   const __m128i ab = _mm_unpacklo_epi16(*b, *a);
97   const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
98   // -sin cos, -sin cos, -sin cos, -sin cos
99   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
100   const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
101   const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
102   const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
103   const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
104   const __m128i x = _mm_packs_epi32(x1, x1);
105   const __m128i y = _mm_packs_epi32(y1, y1);
106   if (flip) {
107     *a = y;
108     *b = x;
109   } else {
110     *a = x;
111     *b = y;
112   }
113 }
114 
115 // Butterfly rotate 8 values.
116 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
117                                                const int angle,
118                                                const bool flip) {
119   const int16_t cos128 = Cos128(angle);
120   const int16_t sin128 = Sin128(angle);
121   const __m128i psin_pcos = _mm_set1_epi32(
122       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
123   const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
124   // -sin cos, -sin cos, -sin cos, -sin cos
125   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
126   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
127   const __m128i ab = _mm_unpacklo_epi16(*b, *a);
128   const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
129   const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
130   const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
131   const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
132   const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
133   const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
134   const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
135   const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
136   const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
137   const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
138   const __m128i x = _mm_packs_epi32(x1, x1_hi);
139   const __m128i y = _mm_packs_epi32(y1, y1_hi);
140   if (flip) {
141     *a = y;
142     *b = x;
143   } else {
144     *a = x;
145     *b = y;
146   }
147 }
148 
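// Faster butterfly variants for when one of the two inputs is known to be
// zero: the rotation collapses to two independent multiplies. The Q12
// constants are pre-shifted left by 3 so that _mm_mulhrs_epi16, which
// computes round(a * b / 2^15), yields round(value * coefficient / 2^12).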
149 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
150                                                          const int angle,
151                                                          const bool flip) {
152   const int16_t cos128 = Cos128(angle);
153   const int16_t sin128 = Sin128(angle);
154   const __m128i pcos = _mm_set1_epi16(cos128 << 3);
155   const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
156   const __m128i x = _mm_mulhrs_epi16(*b, psin);
157   const __m128i y = _mm_mulhrs_epi16(*b, pcos);
158   if (flip) {
159     *a = y;
160     *b = x;
161   } else {
162     *a = x;
163     *b = y;
164   }
165 }
166 
167 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
168                                                           __m128i* b,
169                                                           const int angle,
170                                                           const bool flip) {
171   const int16_t cos128 = Cos128(angle);
172   const int16_t sin128 = Sin128(angle);
173   const __m128i pcos = _mm_set1_epi16(cos128 << 3);
174   const __m128i psin = _mm_set1_epi16(sin128 << 3);
175   const __m128i x = _mm_mulhrs_epi16(*a, pcos);
176   const __m128i y = _mm_mulhrs_epi16(*a, psin);
177   if (flip) {
178     *a = y;
179     *b = x;
180   } else {
181     *a = x;
182     *b = y;
183   }
184 }
185 
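// Butterfly add/sub with 16-bit saturation. With |flip| unset the outputs
// are (a + b, a - b); with |flip| set they are (b - a, a + b).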
186 LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
187   __m128i x, y;
188   if (flip) {
189     y = _mm_adds_epi16(*b, *a);
190     x = _mm_subs_epi16(*b, *a);
191   } else {
192     x = _mm_adds_epi16(*a, *b);
193     y = _mm_subs_epi16(*a, *b);
194   }
195   *a = x;
196   *b = y;
197 }
198 
199 using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
200                                        bool flip);
201 
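// Applies the row shift with rounding. Adding the rounding bias can push a
// residual that is close to INT16_MAX past it; the mask selects the logical
// shift for those lanes so the wrapped sum is still treated as positive.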
202 LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
203                                             const __m128i v_row_shift_add,
204                                             const __m128i v_row_shift) {
205   const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
206   // The max row_shift is 2, so int16_t values greater than 0x7ffd may
207   // overflow.  Generate a mask for this case.
208   const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
209   const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
210   // Assume int16_t values.
211   const __m128i a = _mm_sra_epi16(x, v_row_shift);
212   // Assume uint16_t values.
213   const __m128i b = _mm_srl_epi16(x, v_row_shift);
214   // Select the correct shifted value.
215   return _mm_blendv_epi8(a, b, mask);
216 }
217 
218 //------------------------------------------------------------------------------
219 // Discrete Cosine Transforms (DCT).
220 
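// DC-only fast path for the row transform. When adjusted_tx_height == 1 only
// dst[0] is nonzero, so the row reduces to dst[0] scaled by cos(pi/4)
// (Cos128(32) in Q12), optionally pre-rounded, row-shifted and broadcast
// across |width| columns. Returns true when this path handled the block.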
221 template <int width>
222 LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
223                                      bool should_round, int row_shift) {
224   if (adjusted_tx_height > 1) return false;
225 
226   auto* dst = static_cast<int16_t*>(dest);
227   const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
228   const __m128i v_src =
229       (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
230   const __m128i v_mask =
231       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
232   const __m128i v_kTransformRowMultiplier =
233       _mm_set1_epi16(kTransformRowMultiplier << 3);
234   const __m128i v_src_round =
235       _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
236   const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
237   const int16_t cos128 = Cos128(32);
238   const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
239 
240   // Expand to 32 bits to prevent int16_t overflows during the shift add.
241   const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
242   const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
243   const __m128i a = _mm_cvtepi16_epi32(xy);
244   const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
245   const __m128i b = _mm_add_epi32(a, v_row_shift_add);
246   const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
247   const __m128i c = _mm_sra_epi32(b, v_row_shift);
248   const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
249   const __m128i xy_shifted = _mm_packs_epi32(c, c1);
250 
251   if (width == 4) {
252     StoreLo8(dst, xy_shifted);
253   } else {
254     for (int i = 0; i < width; i += 8) {
255       StoreUnaligned16(dst, xy_shifted);
256       dst += 8;
257     }
258   }
259   return true;
260 }
261 
262 template <int height>
263 LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
264                                            int width) {
265   if (adjusted_tx_height > 1) return false;
266 
267   auto* dst = static_cast<int16_t*>(dest);
268   const int16_t cos128 = Cos128(32);
269 
270   // Calculate dc values for first row.
271   if (width == 4) {
272     const __m128i v_src = LoadLo8(dst);
273     const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
274     StoreLo8(dst, xy);
275   } else {
276     int i = 0;
277     do {
278       const __m128i v_src = LoadUnaligned16(&dst[i]);
279       const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
280       StoreUnaligned16(&dst[i], xy);
281       i += 8;
282     } while (i < width);
283   }
284 
285   // Copy first row to the rest of the block.
286   for (int y = 1; y < height; ++y) {
287     memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
288   }
289   return true;
290 }
291 
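// The stage numbers in the Dct*Stages helpers refer to a single flow graph
// shared by the 4-, 8-, 16-, 32- and 64-point DCTs, which is why they are
// not consecutive within any one helper.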
292 template <ButterflyRotationFunc butterfly_rotation,
293           bool is_fast_butterfly = false>
294 LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
295   // stage 12.
296   if (is_fast_butterfly) {
297     ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
298     ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
299   } else {
300     butterfly_rotation(&s[0], &s[1], 32, true);
301     butterfly_rotation(&s[2], &s[3], 48, false);
302   }
303 
304   // stage 17.
305   HadamardRotation(&s[0], &s[3], false);
306   HadamardRotation(&s[1], &s[2], false);
307 }
308 
309 // Process 4 dct4 rows or columns, depending on the transpose flag.
310 template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
311 LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
312                                        bool transpose) {
313   auto* const dst = static_cast<int16_t*>(dest);
314   __m128i s[4], x[4];
315 
316   if (stage_is_rectangular) {
317     if (transpose) {
318       __m128i input[8];
319       LoadSrc<8, 8>(dst, step, 0, input);
320       Transpose4x8To8x4_U16(input, x);
321     } else {
322       LoadSrc<16, 4>(dst, step, 0, x);
323     }
324   } else {
325     LoadSrc<8, 4>(dst, step, 0, x);
326     if (transpose) {
327       Transpose4x4_U16(x, x);
328     }
329   }
330   // stage 1.
331   // kBitReverseLookup 0, 2, 1, 3
332   s[0] = x[0];
333   s[1] = x[2];
334   s[2] = x[1];
335   s[3] = x[3];
336 
337   Dct4Stages<butterfly_rotation>(s);
338 
339   if (stage_is_rectangular) {
340     if (transpose) {
341       __m128i output[8];
342       Transpose8x4To4x8_U16(s, output);
343       StoreDst<8, 8>(dst, step, 0, output);
344     } else {
345       StoreDst<16, 4>(dst, step, 0, s);
346     }
347   } else {
348     if (transpose) {
349       Transpose4x4_U16(s, s);
350     }
351     StoreDst<8, 4>(dst, step, 0, s);
352   }
353 }
354 
355 template <ButterflyRotationFunc butterfly_rotation,
356           bool is_fast_butterfly = false>
357 LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
358   // stage 8.
359   if (is_fast_butterfly) {
360     ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
361     ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
362   } else {
363     butterfly_rotation(&s[4], &s[7], 56, false);
364     butterfly_rotation(&s[5], &s[6], 24, false);
365   }
366 
367   // stage 13.
368   HadamardRotation(&s[4], &s[5], false);
369   HadamardRotation(&s[6], &s[7], true);
370 
371   // stage 18.
372   butterfly_rotation(&s[6], &s[5], 32, true);
373 
374   // stage 22.
375   HadamardRotation(&s[0], &s[7], false);
376   HadamardRotation(&s[1], &s[6], false);
377   HadamardRotation(&s[2], &s[5], false);
378   HadamardRotation(&s[3], &s[4], false);
379 }
380 
381 // Process dct8 rows or columns, depending on the transpose flag.
382 template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
383 LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
384                                        bool transpose) {
385   auto* const dst = static_cast<int16_t*>(dest);
386   __m128i s[8], x[8];
387 
388   if (stage_is_rectangular) {
389     if (transpose) {
390       __m128i input[4];
391       LoadSrc<16, 4>(dst, step, 0, input);
392       Transpose8x4To4x8_U16(input, x);
393     } else {
394       LoadSrc<8, 8>(dst, step, 0, x);
395     }
396   } else {
397     if (transpose) {
398       __m128i input[8];
399       LoadSrc<16, 8>(dst, step, 0, input);
400       Transpose8x8_U16(input, x);
401     } else {
402       LoadSrc<16, 8>(dst, step, 0, x);
403     }
404   }
405 
406   // stage 1.
407   // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
408   s[0] = x[0];
409   s[1] = x[4];
410   s[2] = x[2];
411   s[3] = x[6];
412   s[4] = x[1];
413   s[5] = x[5];
414   s[6] = x[3];
415   s[7] = x[7];
416 
417   Dct4Stages<butterfly_rotation>(s);
418   Dct8Stages<butterfly_rotation>(s);
419 
420   if (stage_is_rectangular) {
421     if (transpose) {
422       __m128i output[4];
423       Transpose4x8To8x4_U16(s, output);
424       StoreDst<16, 4>(dst, step, 0, output);
425     } else {
426       StoreDst<8, 8>(dst, step, 0, s);
427     }
428   } else {
429     if (transpose) {
430       __m128i output[8];
431       Transpose8x8_U16(s, output);
432       StoreDst<16, 8>(dst, step, 0, output);
433     } else {
434       StoreDst<16, 8>(dst, step, 0, s);
435     }
436   }
437 }
438 
439 template <ButterflyRotationFunc butterfly_rotation,
440           bool is_fast_butterfly = false>
441 LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
442   // stage 5.
443   if (is_fast_butterfly) {
444     ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
445     ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
446     ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
447     ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
448   } else {
449     butterfly_rotation(&s[8], &s[15], 60, false);
450     butterfly_rotation(&s[9], &s[14], 28, false);
451     butterfly_rotation(&s[10], &s[13], 44, false);
452     butterfly_rotation(&s[11], &s[12], 12, false);
453   }
454 
455   // stage 9.
456   HadamardRotation(&s[8], &s[9], false);
457   HadamardRotation(&s[10], &s[11], true);
458   HadamardRotation(&s[12], &s[13], false);
459   HadamardRotation(&s[14], &s[15], true);
460 
461   // stage 14.
462   butterfly_rotation(&s[14], &s[9], 48, true);
463   butterfly_rotation(&s[13], &s[10], 112, true);
464 
465   // stage 19.
466   HadamardRotation(&s[8], &s[11], false);
467   HadamardRotation(&s[9], &s[10], false);
468   HadamardRotation(&s[12], &s[15], true);
469   HadamardRotation(&s[13], &s[14], true);
470 
471   // stage 23.
472   butterfly_rotation(&s[13], &s[10], 32, true);
473   butterfly_rotation(&s[12], &s[11], 32, true);
474 
475   // stage 26.
476   HadamardRotation(&s[0], &s[15], false);
477   HadamardRotation(&s[1], &s[14], false);
478   HadamardRotation(&s[2], &s[13], false);
479   HadamardRotation(&s[3], &s[12], false);
480   HadamardRotation(&s[4], &s[11], false);
481   HadamardRotation(&s[5], &s[10], false);
482   HadamardRotation(&s[6], &s[9], false);
483   HadamardRotation(&s[7], &s[8], false);
484 }
485 
486 // Process dct16 rows or columns, depending on the transpose flag.
487 template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
488 LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
489                                         bool transpose) {
490   auto* const dst = static_cast<int16_t*>(dest);
491   __m128i s[16], x[16];
492 
493   if (stage_is_rectangular) {
494     if (transpose) {
495       __m128i input[4];
496       LoadSrc<16, 4>(dst, step, 0, input);
497       Transpose8x4To4x8_U16(input, x);
498       LoadSrc<16, 4>(dst, step, 8, input);
499       Transpose8x4To4x8_U16(input, &x[8]);
500     } else {
501       LoadSrc<8, 16>(dst, step, 0, x);
502     }
503   } else {
504     if (transpose) {
505       for (int idx = 0; idx < 16; idx += 8) {
506         __m128i input[8];
507         LoadSrc<16, 8>(dst, step, idx, input);
508         Transpose8x8_U16(input, &x[idx]);
509       }
510     } else {
511       LoadSrc<16, 16>(dst, step, 0, x);
512     }
513   }
514 
515   // stage 1
516   // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
517   s[0] = x[0];
518   s[1] = x[8];
519   s[2] = x[4];
520   s[3] = x[12];
521   s[4] = x[2];
522   s[5] = x[10];
523   s[6] = x[6];
524   s[7] = x[14];
525   s[8] = x[1];
526   s[9] = x[9];
527   s[10] = x[5];
528   s[11] = x[13];
529   s[12] = x[3];
530   s[13] = x[11];
531   s[14] = x[7];
532   s[15] = x[15];
533 
534   Dct4Stages<butterfly_rotation>(s);
535   Dct8Stages<butterfly_rotation>(s);
536   Dct16Stages<butterfly_rotation>(s);
537 
538   if (stage_is_rectangular) {
539     if (transpose) {
540       __m128i output[4];
541       Transpose4x8To8x4_U16(s, output);
542       StoreDst<16, 4>(dst, step, 0, output);
543       Transpose4x8To8x4_U16(&s[8], output);
544       StoreDst<16, 4>(dst, step, 8, output);
545     } else {
546       StoreDst<8, 16>(dst, step, 0, s);
547     }
548   } else {
549     if (transpose) {
550       for (int idx = 0; idx < 16; idx += 8) {
551         __m128i output[8];
552         Transpose8x8_U16(&s[idx], output);
553         StoreDst<16, 8>(dst, step, idx, output);
554       }
555     } else {
556       StoreDst<16, 16>(dst, step, 0, s);
557     }
558   }
559 }
560 
561 template <ButterflyRotationFunc butterfly_rotation,
562           bool is_fast_butterfly = false>
563 LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
564   // stage 3
565   if (is_fast_butterfly) {
566     ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
567     ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
568     ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
569     ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
570     ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
571     ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
572     ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
573     ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
574   } else {
575     butterfly_rotation(&s[16], &s[31], 62, false);
576     butterfly_rotation(&s[17], &s[30], 30, false);
577     butterfly_rotation(&s[18], &s[29], 46, false);
578     butterfly_rotation(&s[19], &s[28], 14, false);
579     butterfly_rotation(&s[20], &s[27], 54, false);
580     butterfly_rotation(&s[21], &s[26], 22, false);
581     butterfly_rotation(&s[22], &s[25], 38, false);
582     butterfly_rotation(&s[23], &s[24], 6, false);
583   }
584   // stage 6.
585   HadamardRotation(&s[16], &s[17], false);
586   HadamardRotation(&s[18], &s[19], true);
587   HadamardRotation(&s[20], &s[21], false);
588   HadamardRotation(&s[22], &s[23], true);
589   HadamardRotation(&s[24], &s[25], false);
590   HadamardRotation(&s[26], &s[27], true);
591   HadamardRotation(&s[28], &s[29], false);
592   HadamardRotation(&s[30], &s[31], true);
593 
594   // stage 10.
595   butterfly_rotation(&s[30], &s[17], 24 + 32, true);
596   butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
597   butterfly_rotation(&s[26], &s[21], 24, true);
598   butterfly_rotation(&s[25], &s[22], 24 + 64, true);
599 
600   // stage 15.
601   HadamardRotation(&s[16], &s[19], false);
602   HadamardRotation(&s[17], &s[18], false);
603   HadamardRotation(&s[20], &s[23], true);
604   HadamardRotation(&s[21], &s[22], true);
605   HadamardRotation(&s[24], &s[27], false);
606   HadamardRotation(&s[25], &s[26], false);
607   HadamardRotation(&s[28], &s[31], true);
608   HadamardRotation(&s[29], &s[30], true);
609 
610   // stage 20.
611   butterfly_rotation(&s[29], &s[18], 48, true);
612   butterfly_rotation(&s[28], &s[19], 48, true);
613   butterfly_rotation(&s[27], &s[20], 48 + 64, true);
614   butterfly_rotation(&s[26], &s[21], 48 + 64, true);
615 
616   // stage 24.
617   HadamardRotation(&s[16], &s[23], false);
618   HadamardRotation(&s[17], &s[22], false);
619   HadamardRotation(&s[18], &s[21], false);
620   HadamardRotation(&s[19], &s[20], false);
621   HadamardRotation(&s[24], &s[31], true);
622   HadamardRotation(&s[25], &s[30], true);
623   HadamardRotation(&s[26], &s[29], true);
624   HadamardRotation(&s[27], &s[28], true);
625 
626   // stage 27.
627   butterfly_rotation(&s[27], &s[20], 32, true);
628   butterfly_rotation(&s[26], &s[21], 32, true);
629   butterfly_rotation(&s[25], &s[22], 32, true);
630   butterfly_rotation(&s[24], &s[23], 32, true);
631 
632   // stage 29.
633   HadamardRotation(&s[0], &s[31], false);
634   HadamardRotation(&s[1], &s[30], false);
635   HadamardRotation(&s[2], &s[29], false);
636   HadamardRotation(&s[3], &s[28], false);
637   HadamardRotation(&s[4], &s[27], false);
638   HadamardRotation(&s[5], &s[26], false);
639   HadamardRotation(&s[6], &s[25], false);
640   HadamardRotation(&s[7], &s[24], false);
641   HadamardRotation(&s[8], &s[23], false);
642   HadamardRotation(&s[9], &s[22], false);
643   HadamardRotation(&s[10], &s[21], false);
644   HadamardRotation(&s[11], &s[20], false);
645   HadamardRotation(&s[12], &s[19], false);
646   HadamardRotation(&s[13], &s[18], false);
647   HadamardRotation(&s[14], &s[17], false);
648   HadamardRotation(&s[15], &s[16], false);
649 }
650 
651 // Process dct32 rows or columns, depending on the transpose flag.
652 LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
653                                         const bool transpose) {
654   auto* const dst = static_cast<int16_t*>(dest);
655   __m128i s[32], x[32];
656 
657   if (transpose) {
658     for (int idx = 0; idx < 32; idx += 8) {
659       __m128i input[8];
660       LoadSrc<16, 8>(dst, step, idx, input);
661       Transpose8x8_U16(input, &x[idx]);
662     }
663   } else {
664     LoadSrc<16, 32>(dst, step, 0, x);
665   }
666 
667   // stage 1
668   // kBitReverseLookup
669   // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
670   s[0] = x[0];
671   s[1] = x[16];
672   s[2] = x[8];
673   s[3] = x[24];
674   s[4] = x[4];
675   s[5] = x[20];
676   s[6] = x[12];
677   s[7] = x[28];
678   s[8] = x[2];
679   s[9] = x[18];
680   s[10] = x[10];
681   s[11] = x[26];
682   s[12] = x[6];
683   s[13] = x[22];
684   s[14] = x[14];
685   s[15] = x[30];
686 
687   // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
688   s[16] = x[1];
689   s[17] = x[17];
690   s[18] = x[9];
691   s[19] = x[25];
692   s[20] = x[5];
693   s[21] = x[21];
694   s[22] = x[13];
695   s[23] = x[29];
696   s[24] = x[3];
697   s[25] = x[19];
698   s[26] = x[11];
699   s[27] = x[27];
700   s[28] = x[7];
701   s[29] = x[23];
702   s[30] = x[15];
703   s[31] = x[31];
704 
705   Dct4Stages<ButterflyRotation_8>(s);
706   Dct8Stages<ButterflyRotation_8>(s);
707   Dct16Stages<ButterflyRotation_8>(s);
708   Dct32Stages<ButterflyRotation_8>(s);
709 
710   if (transpose) {
711     for (int idx = 0; idx < 32; idx += 8) {
712       __m128i output[8];
713       Transpose8x8_U16(&s[idx], output);
714       StoreDst<16, 8>(dst, step, idx, output);
715     }
716   } else {
717     StoreDst<16, 32>(dst, step, 0, s);
718   }
719 }
720 
721 // Allow the compiler to call this function instead of force-inlining it. Tests
722 // show the performance is slightly faster.
723 void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
724   auto* const dst = static_cast<int16_t*>(dest);
725   __m128i s[64], x[32];
726 
727   if (transpose) {
728     // The last 32 values of every row are always zero if the |tx_width| is
729     // 64.
730     for (int idx = 0; idx < 32; idx += 8) {
731       __m128i input[8];
732       LoadSrc<16, 8>(dst, step, idx, input);
733       Transpose8x8_U16(input, &x[idx]);
734     }
735   } else {
736     // The last 32 values of every column are always zero if the |tx_height| is
737     // 64.
738     LoadSrc<16, 32>(dst, step, 0, x);
739   }
740 
741   // stage 1
742   // kBitReverseLookup
743   // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
744   s[0] = x[0];
745   s[2] = x[16];
746   s[4] = x[8];
747   s[6] = x[24];
748   s[8] = x[4];
749   s[10] = x[20];
750   s[12] = x[12];
751   s[14] = x[28];
752 
753   // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
754   s[16] = x[2];
755   s[18] = x[18];
756   s[20] = x[10];
757   s[22] = x[26];
758   s[24] = x[6];
759   s[26] = x[22];
760   s[28] = x[14];
761   s[30] = x[30];
762 
763   // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
764   s[32] = x[1];
765   s[34] = x[17];
766   s[36] = x[9];
767   s[38] = x[25];
768   s[40] = x[5];
769   s[42] = x[21];
770   s[44] = x[13];
771   s[46] = x[29];
772 
773   // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
774   s[48] = x[3];
775   s[50] = x[19];
776   s[52] = x[11];
777   s[54] = x[27];
778   s[56] = x[7];
779   s[58] = x[23];
780   s[60] = x[15];
781   s[62] = x[31];
782 
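  // Only the even-indexed s[] entries were seeded above. The odd entries
  // correspond to coefficients that are known to be zero, and the
  // is_fast_butterfly paths below never read them.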
783   Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
784   Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
785   Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
786   Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
787 
788   //-- start dct 64 stages
789   // stage 2.
790   ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
791   ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
792   ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
793   ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
794   ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
795   ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
796   ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
797   ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
798   ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
799   ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
800   ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
801   ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
802   ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
803   ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
804   ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
805   ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
806 
807   // stage 4.
808   HadamardRotation(&s[32], &s[33], false);
809   HadamardRotation(&s[34], &s[35], true);
810   HadamardRotation(&s[36], &s[37], false);
811   HadamardRotation(&s[38], &s[39], true);
812   HadamardRotation(&s[40], &s[41], false);
813   HadamardRotation(&s[42], &s[43], true);
814   HadamardRotation(&s[44], &s[45], false);
815   HadamardRotation(&s[46], &s[47], true);
816   HadamardRotation(&s[48], &s[49], false);
817   HadamardRotation(&s[50], &s[51], true);
818   HadamardRotation(&s[52], &s[53], false);
819   HadamardRotation(&s[54], &s[55], true);
820   HadamardRotation(&s[56], &s[57], false);
821   HadamardRotation(&s[58], &s[59], true);
822   HadamardRotation(&s[60], &s[61], false);
823   HadamardRotation(&s[62], &s[63], true);
824 
825   // stage 7.
826   ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
827   ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
828   ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
829   ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
830   ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
831   ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
832   ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
833   ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
834 
835   // stage 11.
836   HadamardRotation(&s[32], &s[35], false);
837   HadamardRotation(&s[33], &s[34], false);
838   HadamardRotation(&s[36], &s[39], true);
839   HadamardRotation(&s[37], &s[38], true);
840   HadamardRotation(&s[40], &s[43], false);
841   HadamardRotation(&s[41], &s[42], false);
842   HadamardRotation(&s[44], &s[47], true);
843   HadamardRotation(&s[45], &s[46], true);
844   HadamardRotation(&s[48], &s[51], false);
845   HadamardRotation(&s[49], &s[50], false);
846   HadamardRotation(&s[52], &s[55], true);
847   HadamardRotation(&s[53], &s[54], true);
848   HadamardRotation(&s[56], &s[59], false);
849   HadamardRotation(&s[57], &s[58], false);
850   HadamardRotation(&s[60], &s[63], true);
851   HadamardRotation(&s[61], &s[62], true);
852 
853   // stage 16.
854   ButterflyRotation_8(&s[61], &s[34], 56, true);
855   ButterflyRotation_8(&s[60], &s[35], 56, true);
856   ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
857   ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
858   ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
859   ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
860   ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
861   ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
862 
863   // stage 21.
864   HadamardRotation(&s[32], &s[39], false);
865   HadamardRotation(&s[33], &s[38], false);
866   HadamardRotation(&s[34], &s[37], false);
867   HadamardRotation(&s[35], &s[36], false);
868   HadamardRotation(&s[40], &s[47], true);
869   HadamardRotation(&s[41], &s[46], true);
870   HadamardRotation(&s[42], &s[45], true);
871   HadamardRotation(&s[43], &s[44], true);
872   HadamardRotation(&s[48], &s[55], false);
873   HadamardRotation(&s[49], &s[54], false);
874   HadamardRotation(&s[50], &s[53], false);
875   HadamardRotation(&s[51], &s[52], false);
876   HadamardRotation(&s[56], &s[63], true);
877   HadamardRotation(&s[57], &s[62], true);
878   HadamardRotation(&s[58], &s[61], true);
879   HadamardRotation(&s[59], &s[60], true);
880 
881   // stage 25.
882   ButterflyRotation_8(&s[59], &s[36], 48, true);
883   ButterflyRotation_8(&s[58], &s[37], 48, true);
884   ButterflyRotation_8(&s[57], &s[38], 48, true);
885   ButterflyRotation_8(&s[56], &s[39], 48, true);
886   ButterflyRotation_8(&s[55], &s[40], 112, true);
887   ButterflyRotation_8(&s[54], &s[41], 112, true);
888   ButterflyRotation_8(&s[53], &s[42], 112, true);
889   ButterflyRotation_8(&s[52], &s[43], 112, true);
890 
891   // stage 28.
892   HadamardRotation(&s[32], &s[47], false);
893   HadamardRotation(&s[33], &s[46], false);
894   HadamardRotation(&s[34], &s[45], false);
895   HadamardRotation(&s[35], &s[44], false);
896   HadamardRotation(&s[36], &s[43], false);
897   HadamardRotation(&s[37], &s[42], false);
898   HadamardRotation(&s[38], &s[41], false);
899   HadamardRotation(&s[39], &s[40], false);
900   HadamardRotation(&s[48], &s[63], true);
901   HadamardRotation(&s[49], &s[62], true);
902   HadamardRotation(&s[50], &s[61], true);
903   HadamardRotation(&s[51], &s[60], true);
904   HadamardRotation(&s[52], &s[59], true);
905   HadamardRotation(&s[53], &s[58], true);
906   HadamardRotation(&s[54], &s[57], true);
907   HadamardRotation(&s[55], &s[56], true);
908 
909   // stage 30.
910   ButterflyRotation_8(&s[55], &s[40], 32, true);
911   ButterflyRotation_8(&s[54], &s[41], 32, true);
912   ButterflyRotation_8(&s[53], &s[42], 32, true);
913   ButterflyRotation_8(&s[52], &s[43], 32, true);
914   ButterflyRotation_8(&s[51], &s[44], 32, true);
915   ButterflyRotation_8(&s[50], &s[45], 32, true);
916   ButterflyRotation_8(&s[49], &s[46], 32, true);
917   ButterflyRotation_8(&s[48], &s[47], 32, true);
918 
919   // stage 31.
920   for (int i = 0; i < 32; i += 4) {
921     HadamardRotation(&s[i], &s[63 - i], false);
922     HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
923     HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
924     HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
925   }
926   //-- end dct 64 stages
927 
928   if (transpose) {
929     for (int idx = 0; idx < 64; idx += 8) {
930       __m128i output[8];
931       Transpose8x8_U16(&s[idx], output);
932       StoreDst<16, 8>(dst, step, idx, output);
933     }
934   } else {
935     StoreDst<16, 64>(dst, step, 0, s);
936   }
937 }
938 
939 //------------------------------------------------------------------------------
940 // Asymmetric Discrete Sine Transforms (ADST).
941 
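// The 4-point ADST uses the integer formulation from the AV1 specification,
// accumulating kAdst4Multiplier products in 32 bits; the 8- and 16-point
// ADSTs are built from the same butterfly rotations as the DCTs.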
942 template <bool stage_is_rectangular>
943 LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
944                                         bool transpose) {
945   auto* const dst = static_cast<int16_t*>(dest);
946   __m128i s[8], x[4];
947 
948   if (stage_is_rectangular) {
949     if (transpose) {
950       __m128i input[8];
951       LoadSrc<8, 8>(dst, step, 0, input);
952       Transpose4x8To8x4_U16(input, x);
953     } else {
954       LoadSrc<16, 4>(dst, step, 0, x);
955     }
956   } else {
957     LoadSrc<8, 4>(dst, step, 0, x);
958     if (transpose) {
959       Transpose4x4_U16(x, x);
960     }
961   }
962 
963   const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
964   const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
965   const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
966   const __m128i kAdst4Multiplier_m0_1 =
967       _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
968                      (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
969   const __m128i kAdst4Multiplier_3_0 =
970       _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
971                      (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
972 
973   // stage 1.
974   const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
975   const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
976   const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
977   const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
978   const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
979 
980   s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
981   s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
982 
983   // stage 2.
984   // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
985   const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
986   const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
987   const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
988 
989   // stage 3.
990   s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
991   s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
992   s[2] = b7;
993   s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
994 
995   // stage 4.
996   s[0] = _mm_add_epi32(s[0], s[5]);
997   s[1] = _mm_sub_epi32(s[1], s[6]);
998 
999   // stages 5 and 6.
1000   x[0] = _mm_add_epi32(s[0], s[3]);
1001   x[1] = _mm_add_epi32(s[1], s[3]);
1002   x[2] = _mm_add_epi32(s[0], s[1]);
1003   x[3] = _mm_sub_epi32(x[2], s[3]);
1004 
1005   x[0] = RightShiftWithRounding_S32(x[0], 12);
1006   x[1] = RightShiftWithRounding_S32(x[1], 12);
1007   x[2] = RightShiftWithRounding_S32(s[2], 12);
1008   x[3] = RightShiftWithRounding_S32(x[3], 12);
1009 
1010   x[0] = _mm_packs_epi32(x[0], x[1]);
1011   x[2] = _mm_packs_epi32(x[2], x[3]);
1012   x[1] = _mm_srli_si128(x[0], 8);
1013   x[3] = _mm_srli_si128(x[2], 8);
1014 
1015   if (stage_is_rectangular) {
1016     if (transpose) {
1017       __m128i output[8];
1018       Transpose8x4To4x8_U16(x, output);
1019       StoreDst<8, 8>(dst, step, 0, output);
1020     } else {
1021       StoreDst<16, 4>(dst, step, 0, x);
1022     }
1023   } else {
1024     if (transpose) {
1025       Transpose4x4_U16(x, x);
1026     }
1027     StoreDst<8, 4>(dst, step, 0, x);
1028   }
1029 }
1030 
1031 constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
1032                                                3344, 0, 2482, 1321};
1033 
1034 LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
1035                                        bool should_round, int row_shift) {
1036   if (adjusted_tx_height > 1) return false;
1037 
1038   auto* dst = static_cast<int16_t*>(dest);
1039   const __m128i v_src =
1040       _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
1041   const __m128i v_mask =
1042       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1043   const __m128i v_kTransformRowMultiplier =
1044       _mm_set1_epi16(kTransformRowMultiplier << 3);
1045   const __m128i v_src_round =
1046       _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1047   const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
1048   const __m128i v_kAdst4DcOnlyMultipliers =
1049       LoadUnaligned16(kAdst4DcOnlyMultiplier);
1050   // s0*k0 s0*k1 s0*k2 s0*k1
1051   // +
1052   // s0*0  s0*0  s0*0  s0*k0
1053   const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
1054   const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
1055   const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1056   const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
1057   const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
1058   const __m128i b = _mm_sra_epi32(a, v_row_shift);
1059   const __m128i c = _mm_packs_epi32(b, b);
1060   StoreLo8(dst, c);
1061 
1062   return true;
1063 }
1064 
1065 LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
1066                                              int width) {
1067   if (adjusted_tx_height > 1) return false;
1068 
1069   auto* dst = static_cast<int16_t*>(dest);
1070   int i = 0;
1071   do {
1072     const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
1073     const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
1074     const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
1075     const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
1076     const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
1077     const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
1078     const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
1079     const __m128i x0 = s0;
1080     const __m128i x1 = s1;
1081     const __m128i x2 = s2;
1082     const __m128i x3 = _mm_add_epi32(s0, s1);
1083     const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
1084     const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
1085     const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
1086     const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
1087     const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
1088     const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
1089     StoreLo8(&dst[i], dst_0_1);
1090     StoreHi8(&dst[i + width * 1], dst_0_1);
1091     StoreLo8(&dst[i + width * 2], dst_2_3);
1092     StoreHi8(&dst[i + width * 3], dst_2_3);
1093     i += 4;
1094   } while (i < width);
1095 
1096   return true;
1097 }
1098 
1099 template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
1100 LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
1101                                         bool transpose) {
1102   auto* const dst = static_cast<int16_t*>(dest);
1103   __m128i s[8], x[8];
1104 
1105   if (stage_is_rectangular) {
1106     if (transpose) {
1107       __m128i input[4];
1108       LoadSrc<16, 4>(dst, step, 0, input);
1109       Transpose8x4To4x8_U16(input, x);
1110     } else {
1111       LoadSrc<8, 8>(dst, step, 0, x);
1112     }
1113   } else {
1114     if (transpose) {
1115       __m128i input[8];
1116       LoadSrc<16, 8>(dst, step, 0, input);
1117       Transpose8x8_U16(input, x);
1118     } else {
1119       LoadSrc<16, 8>(dst, step, 0, x);
1120     }
1121   }
1122 
1123   // stage 1.
1124   s[0] = x[7];
1125   s[1] = x[0];
1126   s[2] = x[5];
1127   s[3] = x[2];
1128   s[4] = x[3];
1129   s[5] = x[4];
1130   s[6] = x[1];
1131   s[7] = x[6];
1132 
1133   // stage 2.
1134   butterfly_rotation(&s[0], &s[1], 60 - 0, true);
1135   butterfly_rotation(&s[2], &s[3], 60 - 16, true);
1136   butterfly_rotation(&s[4], &s[5], 60 - 32, true);
1137   butterfly_rotation(&s[6], &s[7], 60 - 48, true);
1138 
1139   // stage 3.
1140   HadamardRotation(&s[0], &s[4], false);
1141   HadamardRotation(&s[1], &s[5], false);
1142   HadamardRotation(&s[2], &s[6], false);
1143   HadamardRotation(&s[3], &s[7], false);
1144 
1145   // stage 4.
1146   butterfly_rotation(&s[4], &s[5], 48 - 0, true);
1147   butterfly_rotation(&s[7], &s[6], 48 - 32, true);
1148 
1149   // stage 5.
1150   HadamardRotation(&s[0], &s[2], false);
1151   HadamardRotation(&s[4], &s[6], false);
1152   HadamardRotation(&s[1], &s[3], false);
1153   HadamardRotation(&s[5], &s[7], false);
1154 
1155   // stage 6.
1156   butterfly_rotation(&s[2], &s[3], 32, true);
1157   butterfly_rotation(&s[6], &s[7], 32, true);
1158 
1159   // stage 7.
1160   const __m128i v_zero = _mm_setzero_si128();
1161   x[0] = s[0];
1162   x[1] = _mm_subs_epi16(v_zero, s[4]);
1163   x[2] = s[6];
1164   x[3] = _mm_subs_epi16(v_zero, s[2]);
1165   x[4] = s[3];
1166   x[5] = _mm_subs_epi16(v_zero, s[7]);
1167   x[6] = s[5];
1168   x[7] = _mm_subs_epi16(v_zero, s[1]);
1169 
1170   if (stage_is_rectangular) {
1171     if (transpose) {
1172       __m128i output[4];
1173       Transpose4x8To8x4_U16(x, output);
1174       StoreDst<16, 4>(dst, step, 0, output);
1175     } else {
1176       StoreDst<8, 8>(dst, step, 0, x);
1177     }
1178   } else {
1179     if (transpose) {
1180       __m128i output[8];
1181       Transpose8x8_U16(x, output);
1182       StoreDst<16, 8>(dst, step, 0, output);
1183     } else {
1184       StoreDst<16, 8>(dst, step, 0, x);
1185     }
1186   }
1187 }
1188 
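// DC-only path for the 8-point ADST row transform: it runs the single
// nonzero coefficient through the same stage sequence as Adst8_SSE4_1 and
// then applies the row shift.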
1189 LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
1190                                        bool should_round, int row_shift) {
1191   if (adjusted_tx_height > 1) return false;
1192 
1193   auto* dst = static_cast<int16_t*>(dest);
1194   __m128i s[8];
1195 
1196   const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
1197   const __m128i v_mask =
1198       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1199   const __m128i v_kTransformRowMultiplier =
1200       _mm_set1_epi16(kTransformRowMultiplier << 3);
1201   const __m128i v_src_round =
1202       _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1203   // stage 1.
1204   s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
1205 
1206   // stage 2.
1207   ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
1208 
1209   // stage 3.
1210   s[4] = s[0];
1211   s[5] = s[1];
1212 
1213   // stage 4.
1214   ButterflyRotation_4(&s[4], &s[5], 48, true);
1215 
1216   // stage 5.
1217   s[2] = s[0];
1218   s[3] = s[1];
1219   s[6] = s[4];
1220   s[7] = s[5];
1221 
1222   // stage 6.
1223   ButterflyRotation_4(&s[2], &s[3], 32, true);
1224   ButterflyRotation_4(&s[6], &s[7], 32, true);
1225 
1226   // stage 7.
1227   __m128i x[8];
1228   const __m128i v_zero = _mm_setzero_si128();
1229   x[0] = s[0];
1230   x[1] = _mm_subs_epi16(v_zero, s[4]);
1231   x[2] = s[6];
1232   x[3] = _mm_subs_epi16(v_zero, s[2]);
1233   x[4] = s[3];
1234   x[5] = _mm_subs_epi16(v_zero, s[7]);
1235   x[6] = s[5];
1236   x[7] = _mm_subs_epi16(v_zero, s[1]);
1237 
1238   const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
1239   const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
1240   const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
1241   const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
1242   const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
1243   const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
1244 
1245   const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1246   const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
1247   const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
1248   const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
1249   const __m128i b = _mm_sra_epi32(a, v_row_shift);
1250   const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
1251   StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
1252 
1253   return true;
1254 }
1255 
1256 LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
1257                                              int width) {
1258   if (adjusted_tx_height > 1) return false;
1259 
1260   auto* dst = static_cast<int16_t*>(dest);
1261   __m128i s[8];
1262 
1263   int i = 0;
1264   do {
1265     const __m128i v_src = LoadLo8(dst);
1266     // stage 1.
1267     s[1] = v_src;
1268 
1269     // stage 2.
1270     ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
1271 
1272     // stage 3.
1273     s[4] = s[0];
1274     s[5] = s[1];
1275 
1276     // stage 4.
1277     ButterflyRotation_4(&s[4], &s[5], 48, true);
1278 
1279     // stage 5.
1280     s[2] = s[0];
1281     s[3] = s[1];
1282     s[6] = s[4];
1283     s[7] = s[5];
1284 
1285     // stage 6.
1286     ButterflyRotation_4(&s[2], &s[3], 32, true);
1287     ButterflyRotation_4(&s[6], &s[7], 32, true);
1288 
1289     // stage 7.
1290     __m128i x[8];
1291     const __m128i v_zero = _mm_setzero_si128();
1292     x[0] = s[0];
1293     x[1] = _mm_subs_epi16(v_zero, s[4]);
1294     x[2] = s[6];
1295     x[3] = _mm_subs_epi16(v_zero, s[2]);
1296     x[4] = s[3];
1297     x[5] = _mm_subs_epi16(v_zero, s[7]);
1298     x[6] = s[5];
1299     x[7] = _mm_subs_epi16(v_zero, s[1]);
1300 
1301     for (int j = 0; j < 8; ++j) {
1302       StoreLo8(&dst[j * width], x[j]);
1303     }
1304     i += 4;
1305     dst += 4;
1306   } while (i < width);
1307 
1308   return true;
1309 }
1310 
1311 template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
1312 LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
1313                                          bool transpose) {
1314   auto* const dst = static_cast<int16_t*>(dest);
1315   __m128i s[16], x[16];
1316 
1317   if (stage_is_rectangular) {
1318     if (transpose) {
1319       __m128i input[4];
1320       LoadSrc<16, 4>(dst, step, 0, input);
1321       Transpose8x4To4x8_U16(input, x);
1322       LoadSrc<16, 4>(dst, step, 8, input);
1323       Transpose8x4To4x8_U16(input, &x[8]);
1324     } else {
1325       LoadSrc<8, 16>(dst, step, 0, x);
1326     }
1327   } else {
1328     if (transpose) {
1329       for (int idx = 0; idx < 16; idx += 8) {
1330         __m128i input[8];
1331         LoadSrc<16, 8>(dst, step, idx, input);
1332         Transpose8x8_U16(input, &x[idx]);
1333       }
1334     } else {
1335       LoadSrc<16, 16>(dst, step, 0, x);
1336     }
1337   }
1338 
1339   // stage 1.
1340   s[0] = x[15];
1341   s[1] = x[0];
1342   s[2] = x[13];
1343   s[3] = x[2];
1344   s[4] = x[11];
1345   s[5] = x[4];
1346   s[6] = x[9];
1347   s[7] = x[6];
1348   s[8] = x[7];
1349   s[9] = x[8];
1350   s[10] = x[5];
1351   s[11] = x[10];
1352   s[12] = x[3];
1353   s[13] = x[12];
1354   s[14] = x[1];
1355   s[15] = x[14];
1356 
1357   // stage 2.
1358   butterfly_rotation(&s[0], &s[1], 62 - 0, true);
1359   butterfly_rotation(&s[2], &s[3], 62 - 8, true);
1360   butterfly_rotation(&s[4], &s[5], 62 - 16, true);
1361   butterfly_rotation(&s[6], &s[7], 62 - 24, true);
1362   butterfly_rotation(&s[8], &s[9], 62 - 32, true);
1363   butterfly_rotation(&s[10], &s[11], 62 - 40, true);
1364   butterfly_rotation(&s[12], &s[13], 62 - 48, true);
1365   butterfly_rotation(&s[14], &s[15], 62 - 56, true);
1366 
1367   // stage 3.
1368   HadamardRotation(&s[0], &s[8], false);
1369   HadamardRotation(&s[1], &s[9], false);
1370   HadamardRotation(&s[2], &s[10], false);
1371   HadamardRotation(&s[3], &s[11], false);
1372   HadamardRotation(&s[4], &s[12], false);
1373   HadamardRotation(&s[5], &s[13], false);
1374   HadamardRotation(&s[6], &s[14], false);
1375   HadamardRotation(&s[7], &s[15], false);
1376 
1377   // stage 4.
1378   butterfly_rotation(&s[8], &s[9], 56 - 0, true);
1379   butterfly_rotation(&s[13], &s[12], 8 + 0, true);
1380   butterfly_rotation(&s[10], &s[11], 56 - 32, true);
1381   butterfly_rotation(&s[15], &s[14], 8 + 32, true);
1382 
1383   // stage 5.
1384   HadamardRotation(&s[0], &s[4], false);
1385   HadamardRotation(&s[8], &s[12], false);
1386   HadamardRotation(&s[1], &s[5], false);
1387   HadamardRotation(&s[9], &s[13], false);
1388   HadamardRotation(&s[2], &s[6], false);
1389   HadamardRotation(&s[10], &s[14], false);
1390   HadamardRotation(&s[3], &s[7], false);
1391   HadamardRotation(&s[11], &s[15], false);
1392 
1393   // stage 6.
1394   butterfly_rotation(&s[4], &s[5], 48 - 0, true);
1395   butterfly_rotation(&s[12], &s[13], 48 - 0, true);
1396   butterfly_rotation(&s[7], &s[6], 48 - 32, true);
1397   butterfly_rotation(&s[15], &s[14], 48 - 32, true);
1398 
1399   // stage 7.
1400   HadamardRotation(&s[0], &s[2], false);
1401   HadamardRotation(&s[4], &s[6], false);
1402   HadamardRotation(&s[8], &s[10], false);
1403   HadamardRotation(&s[12], &s[14], false);
1404   HadamardRotation(&s[1], &s[3], false);
1405   HadamardRotation(&s[5], &s[7], false);
1406   HadamardRotation(&s[9], &s[11], false);
1407   HadamardRotation(&s[13], &s[15], false);
1408 
1409   // stage 8.
1410   butterfly_rotation(&s[2], &s[3], 32, true);
1411   butterfly_rotation(&s[6], &s[7], 32, true);
1412   butterfly_rotation(&s[10], &s[11], 32, true);
1413   butterfly_rotation(&s[14], &s[15], 32, true);
1414 
1415   // stage 9.
1416   const __m128i v_zero = _mm_setzero_si128();
1417   x[0] = s[0];
1418   x[1] = _mm_subs_epi16(v_zero, s[8]);
1419   x[2] = s[12];
1420   x[3] = _mm_subs_epi16(v_zero, s[4]);
1421   x[4] = s[6];
1422   x[5] = _mm_subs_epi16(v_zero, s[14]);
1423   x[6] = s[10];
1424   x[7] = _mm_subs_epi16(v_zero, s[2]);
1425   x[8] = s[3];
1426   x[9] = _mm_subs_epi16(v_zero, s[11]);
1427   x[10] = s[15];
1428   x[11] = _mm_subs_epi16(v_zero, s[7]);
1429   x[12] = s[5];
1430   x[13] = _mm_subs_epi16(v_zero, s[13]);
1431   x[14] = s[9];
1432   x[15] = _mm_subs_epi16(v_zero, s[1]);
1433 
1434   if (stage_is_rectangular) {
1435     if (transpose) {
1436       __m128i output[4];
1437       Transpose4x8To8x4_U16(x, output);
1438       StoreDst<16, 4>(dst, step, 0, output);
1439       Transpose4x8To8x4_U16(&x[8], output);
1440       StoreDst<16, 4>(dst, step, 8, output);
1441     } else {
1442       StoreDst<8, 16>(dst, step, 0, x);
1443     }
1444   } else {
1445     if (transpose) {
1446       for (int idx = 0; idx < 16; idx += 8) {
1447         __m128i output[8];
1448         Transpose8x8_U16(&x[idx], output);
1449         StoreDst<16, 8>(dst, step, idx, output);
1450       }
1451     } else {
1452       StoreDst<16, 16>(dst, step, 0, x);
1453     }
1454   }
1455 }
1456 
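// Shared tail of the Adst16 DC-only paths: runs stages 2-9 of the 16-point
// ADST on a coefficient that the caller has already placed in s[1].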
1457 LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
1458   // stage 2.
1459   ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
1460 
1461   // stage 3.
1462   s[8] = s[0];
1463   s[9] = s[1];
1464 
1465   // stage 4.
1466   ButterflyRotation_4(&s[8], &s[9], 56, true);
1467 
1468   // stage 5.
1469   s[4] = s[0];
1470   s[12] = s[8];
1471   s[5] = s[1];
1472   s[13] = s[9];
1473 
1474   // stage 6.
1475   ButterflyRotation_4(&s[4], &s[5], 48, true);
1476   ButterflyRotation_4(&s[12], &s[13], 48, true);
1477 
1478   // stage 7.
1479   s[2] = s[0];
1480   s[6] = s[4];
1481   s[10] = s[8];
1482   s[14] = s[12];
1483   s[3] = s[1];
1484   s[7] = s[5];
1485   s[11] = s[9];
1486   s[15] = s[13];
1487 
1488   // stage 8.
1489   ButterflyRotation_4(&s[2], &s[3], 32, true);
1490   ButterflyRotation_4(&s[6], &s[7], 32, true);
1491   ButterflyRotation_4(&s[10], &s[11], 32, true);
1492   ButterflyRotation_4(&s[14], &s[15], 32, true);
1493 
1494   // stage 9.
1495   const __m128i v_zero = _mm_setzero_si128();
1496   x[0] = s[0];
1497   x[1] = _mm_subs_epi16(v_zero, s[8]);
1498   x[2] = s[12];
1499   x[3] = _mm_subs_epi16(v_zero, s[4]);
1500   x[4] = s[6];
1501   x[5] = _mm_subs_epi16(v_zero, s[14]);
1502   x[6] = s[10];
1503   x[7] = _mm_subs_epi16(v_zero, s[2]);
1504   x[8] = s[3];
1505   x[9] = _mm_subs_epi16(v_zero, s[11]);
1506   x[10] = s[15];
1507   x[11] = _mm_subs_epi16(v_zero, s[7]);
1508   x[12] = s[5];
1509   x[13] = _mm_subs_epi16(v_zero, s[13]);
1510   x[14] = s[9];
1511   x[15] = _mm_subs_epi16(v_zero, s[1]);
1512 }
1513 
1514 LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
1515                                         bool should_round, int row_shift) {
1516   if (adjusted_tx_height > 1) return false;
1517 
1518   auto* dst = static_cast<int16_t*>(dest);
1519   __m128i s[16];
1520   __m128i x[16];
1521 
1522   const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
1523   const __m128i v_mask =
1524       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1525   const __m128i v_kTransformRowMultiplier =
1526       _mm_set1_epi16(kTransformRowMultiplier << 3);
1527   const __m128i v_src_round =
1528       _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1529   // stage 1.
1530   s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
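  // Note on the select above: v_mask is all ones when should_round is true,
  // so _mm_blendv_epi8 picks the pre-rounded lanes and the rounding decision
  // is made without a branch. The same pattern is used by the other DcOnly
  // helpers below.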
1531 
1532   Adst16DcOnlyInternal(s, x);
1533 
1534   for (int i = 0; i < 2; ++i) {
1535     const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
1536     const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
1537     const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
1538     const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
1539     const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
1540     const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
1541 
1542     const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1543     const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
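    // _mm_sra_epi32 takes its shift count from the low 64 bits of its second
    // operand, hence the widening with _mm_cvtepu32_epi64 above.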
1544     const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
1545     const __m128i a1 =
1546         _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
1547     const __m128i b = _mm_sra_epi32(a, v_row_shift);
1548     const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
1549     StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
1550   }
1551   return true;
1552 }
1553 
1554 LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
1555                                               int adjusted_tx_height,
1556                                               int width) {
1557   if (adjusted_tx_height > 1) return false;
1558 
1559   auto* dst = static_cast<int16_t*>(dest);
1560   int i = 0;
1561   do {
1562     __m128i s[16];
1563     __m128i x[16];
1564     const __m128i v_src = LoadUnaligned16(dst);
1565     // stage 1.
1566     s[1] = v_src;
1567 
1568     Adst16DcOnlyInternal(s, x);
1569 
1570     for (int j = 0; j < 16; ++j) {
1571       StoreLo8(&dst[j * width], x[j]);
1572     }
1573     i += 4;
1574     dst += 4;
1575   } while (i < width);
1576 
1577   return true;
1578 }
1579 
1580 //------------------------------------------------------------------------------
1581 // Identity Transforms.
1582 
1583 template <bool is_row_shift>
1584 LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
1585   auto* const dst = static_cast<int16_t*>(dest);
1586 
1587   if (is_row_shift) {
1588     const int shift = 1;
1589     const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1590     const __m128i v_multiplier_one =
1591         _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
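    // Interleaving the rounding constant with the source and multiplying by
    // (kIdentity4Multiplier << 16) | 1 lets a single _mm_madd_epi16 compute
    // src * kIdentity4Multiplier + round per 32-bit lane. The "dual" rounding
    // constant appears to fold the multiplier rounding and the row-shift
    // rounding into one value so only one shift is needed afterwards.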
1592     for (int i = 0; i < 4; i += 2) {
1593       const __m128i v_src = LoadUnaligned16(&dst[i * step]);
1594       const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
1595       const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
1596       const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
1597       const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
1598       const __m128i b = _mm_srai_epi32(a, 12 + shift);
1599       const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
1600       StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
1601     }
1602   } else {
1603     const __m128i v_multiplier =
1604         _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
1605     for (int i = 0; i < 4; i += 2) {
1606       const __m128i v_src = LoadUnaligned16(&dst[i * step]);
1607       const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
1608       const __m128i b = _mm_adds_epi16(a, v_src);
1609       StoreUnaligned16(&dst[i * step], b);
1610     }
1611   }
1612 }
1613 
1614 LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
1615                                            bool should_round, int tx_height) {
1616   if (adjusted_tx_height > 1) return false;
1617 
1618   auto* dst = static_cast<int16_t*>(dest);
1619   const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1620   const __m128i v_mask =
1621       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1622   const __m128i v_kTransformRowMultiplier =
1623       _mm_set1_epi16(kTransformRowMultiplier << 3);
1624   const __m128i v_src_round =
1625       _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1626   const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
1627 
1628   const int shift = (tx_height < 16) ? 0 : 1;
1629   const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1630   const __m128i v_multiplier_one =
1631       _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
1632   const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
1633   const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
1634   const __m128i b = _mm_srai_epi32(a, 12 + shift);
1635   dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1636   return true;
1637 }
1638 
1639 LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
1640     Array2DView<uint8_t> frame, const int start_x, const int start_y,
1641     const int tx_width, const int tx_height, const int16_t* source) {
1642   const int stride = frame.columns();
1643   uint8_t* dst = frame[start_y] + start_x;
1644 
1645   const __m128i v_multiplier_fraction =
1646       _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
1647   const __m128i v_eight = _mm_set1_epi16(8);
1648 
1649   if (tx_width == 4) {
1650     int i = 0;
1651     do {
1652       const __m128i v_src = LoadLo8(&source[i * tx_width]);
1653       const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1654       const __m128i frame_data = Load4(dst);
1655       const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
1656       const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1657       const __m128i b = _mm_srai_epi16(a, 4);
1658       const __m128i c = _mm_cvtepu8_epi16(frame_data);
1659       const __m128i d = _mm_adds_epi16(c, b);
1660       Store4(dst, _mm_packus_epi16(d, d));
1661       dst += stride;
1662     } while (++i < tx_height);
1663   } else {
1664     int i = 0;
1665     do {
1666       const int row = i * tx_width;
1667       int j = 0;
1668       do {
1669         const __m128i v_src = LoadUnaligned16(&source[row + j]);
1670         const __m128i v_src_mult =
1671             _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1672         const __m128i frame_data = LoadLo8(dst + j);
1673         const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
1674         const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1675         const __m128i b = _mm_srai_epi16(a, 4);
1676         const __m128i c = _mm_cvtepu8_epi16(frame_data);
1677         const __m128i d = _mm_adds_epi16(c, b);
1678         StoreLo8(dst + j, _mm_packus_epi16(d, d));
1679         j += 8;
1680       } while (j < tx_width);
1681       dst += stride;
1682     } while (++i < tx_height);
1683   }
1684 }
1685 
1686 LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
1687     Array2DView<uint8_t> frame, const int start_x, const int start_y,
1688     const int tx_width, const int tx_height, const int16_t* source) {
1689   const int stride = frame.columns();
1690   uint8_t* dst = frame[start_y] + start_x;
1691 
1692   const __m128i v_multiplier_fraction =
1693       _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
1694   const __m128i v_eight = _mm_set1_epi16(8);
1695   const __m128i v_kTransformRowMultiplier =
1696       _mm_set1_epi16(kTransformRowMultiplier << 3);
1697 
1698   if (tx_width == 4) {
1699     int i = 0;
1700     do {
1701       const __m128i v_src = LoadLo8(&source[i * tx_width]);
1702       const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
1703       const __m128i frame_data = Load4(dst);
1704       const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
1705       const __m128i v_src_mult2 =
1706           _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
1707       const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
1708       const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
1709       const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
1710       const __m128i b = _mm_srai_epi16(a, 4);
1711       const __m128i c = _mm_adds_epi16(frame_data16, b);
1712       Store4(dst, _mm_packus_epi16(c, c));
1713       dst += stride;
1714     } while (++i < tx_height);
1715   } else {
1716     int i = 0;
1717     do {
1718       const int row = i * tx_width;
1719       int j = 0;
1720       do {
1721         const __m128i v_src = LoadUnaligned16(&source[row + j]);
1722         const __m128i v_src_round =
1723             _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1724         const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
1725         const __m128i v_src_mult2 =
1726             _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
1727         const __m128i frame_data = LoadLo8(dst + j);
1728         const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
1729         const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
1730         const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
1731         const __m128i b = _mm_srai_epi16(a, 4);
1732         const __m128i c = _mm_adds_epi16(frame_data16, b);
1733         StoreLo8(dst + j, _mm_packus_epi16(c, c));
1734         j += 8;
1735       } while (j < tx_width);
1736       dst += stride;
1737     } while (++i < tx_height);
1738   }
1739 }
1740 
1741 LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
1742   auto* const dst = static_cast<int16_t*>(dest);
1743 
1744   // When combining the identity8 multiplier with the row shift, the
1745   // calculations for tx_height equal to 32 can be simplified from
1746   // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
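  // That is exactly what _mm_mulhrs_epi16(A, 1 << 14) computes:
  // (A * (1 << 14) + (1 << 14)) >> 15 == (A + 1) >> 1.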
1747   const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
1748   for (int h = 0; h < 4; ++h) {
1749     const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1750     const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
1751     StoreUnaligned16(&dst[h * step], v_src_mult);
1752   }
1753 }
1754 
1755 LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
1756   auto* const dst = static_cast<int16_t*>(dest);
1757 
1758   for (int h = 0; h < 4; ++h) {
1759     const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1760     // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
1761     // a saturating add here is OK.
1762     const __m128i a = _mm_adds_epi16(v_src, v_src);
1763     StoreUnaligned16(&dst[h * step], a);
1764   }
1765 }
1766 
1767 LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
1768                                            bool should_round, int row_shift) {
1769   if (adjusted_tx_height > 1) return false;
1770 
1771   auto* dst = static_cast<int16_t*>(dest);
1772   const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1773   const __m128i v_mask =
1774       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1775   const __m128i v_kTransformRowMultiplier =
1776       _mm_set1_epi16(kTransformRowMultiplier << 3);
1777   const __m128i v_src_round =
1778       _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1779   const __m128i v_src =
1780       _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
1781   const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
1782   const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1783   const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
1784   const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
1785   const __m128i b = _mm_sra_epi32(a, v_row_shift);
1786   dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1787   return true;
1788 }
1789 
1790 LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
1791     Array2DView<uint8_t> frame, const int start_x, const int start_y,
1792     const int tx_width, const int tx_height, const int16_t* source) {
1793   const int stride = frame.columns();
1794   uint8_t* dst = frame[start_y] + start_x;
1795   const __m128i v_eight = _mm_set1_epi16(8);
1796   if (tx_width == 4) {
1797     int i = 0;
1798     do {
1799       const int row = i * tx_width;
1800       const __m128i v_src = LoadLo8(&source[row]);
1801       const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1802       const __m128i frame_data = Load4(dst);
1803       const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1804       const __m128i b = _mm_srai_epi16(a, 4);
1805       const __m128i c = _mm_cvtepu8_epi16(frame_data);
1806       const __m128i d = _mm_adds_epi16(c, b);
1807       Store4(dst, _mm_packus_epi16(d, d));
1808       dst += stride;
1809     } while (++i < tx_height);
1810   } else {
1811     int i = 0;
1812     do {
1813       const int row = i * tx_width;
1814       int j = 0;
1815       do {
1816         const __m128i v_src = LoadUnaligned16(&source[row + j]);
1817         const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1818         const __m128i frame_data = LoadLo8(dst + j);
1819         const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1820         const __m128i b = _mm_srai_epi16(a, 4);
1821         const __m128i c = _mm_cvtepu8_epi16(frame_data);
1822         const __m128i d = _mm_adds_epi16(c, b);
1823         StoreLo8(dst + j, _mm_packus_epi16(d, d));
1824         j += 8;
1825       } while (j < tx_width);
1826       dst += stride;
1827     } while (++i < tx_height);
1828   }
1829 }
1830 
1831 LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
1832                                                 int shift) {
1833   auto* const dst = static_cast<int16_t*>(dest);
1834 
1835   const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1836   const __m128i v_multiplier_one =
1837       _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
1838   const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
1839 
1840   for (int h = 0; h < 4; ++h) {
1841     const __m128i v_src = LoadUnaligned16(&dst[h * step]);
1842     const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
1843     const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
1844     const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
1845     const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
1846     const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
1847     const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
1848     const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
1849     const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
1850     const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
1851     const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
1852     const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
1853     const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
1854     const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
1855     StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
1856     StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
1857   }
1858 }
1859 
1860 LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
1861                                             bool should_round, int shift) {
1862   if (adjusted_tx_height > 1) return false;
1863 
1864   auto* dst = static_cast<int16_t*>(dest);
1865   const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1866   const __m128i v_mask =
1867       _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
1868   const __m128i v_kTransformRowMultiplier =
1869       _mm_set1_epi16(kTransformRowMultiplier << 3);
1870   const __m128i v_src_round0 =
1871       _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1872   const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
1873   const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1874   const __m128i v_multiplier_one =
1875       _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
1876   const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
1877   const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
1878   const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
1879   const __m128i b = _mm_sra_epi32(a, v_shift);
1880   dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1881   return true;
1882 }
1883 
1884 LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
1885     Array2DView<uint8_t> frame, const int start_x, const int start_y,
1886     const int tx_width, const int tx_height, const int16_t* source) {
1887   const int stride = frame.columns();
1888   uint8_t* dst = frame[start_y] + start_x;
1889   const __m128i v_eight = _mm_set1_epi16(8);
1890   const __m128i v_multiplier =
1891       _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
1892 
1893   if (tx_width == 4) {
1894     int i = 0;
1895     do {
1896       const __m128i v_src = LoadLo8(&source[i * tx_width]);
1897       const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
1898       const __m128i frame_data = Load4(dst);
1899       const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
1900       const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
1901       const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1902       const __m128i b = _mm_srai_epi16(a, 4);
1903       const __m128i c = _mm_cvtepu8_epi16(frame_data);
1904       const __m128i d = _mm_adds_epi16(c, b);
1905       Store4(dst, _mm_packus_epi16(d, d));
1906       dst += stride;
1907     } while (++i < tx_height);
1908   } else {
1909     int i = 0;
1910     do {
1911       const int row = i * tx_width;
1912       int j = 0;
1913       do {
1914         const __m128i v_src = LoadUnaligned16(&source[row + j]);
1915         const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
1916         const __m128i frame_data = LoadLo8(dst + j);
1917         const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
1918         const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
1919         const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
1920         const __m128i b = _mm_srai_epi16(a, 4);
1921         const __m128i c = _mm_cvtepu8_epi16(frame_data);
1922         const __m128i d = _mm_adds_epi16(c, b);
1923         StoreLo8(dst + j, _mm_packus_epi16(d, d));
1924         j += 8;
1925       } while (j < tx_width);
1926       dst += stride;
1927     } while (++i < tx_height);
1928   }
1929 }
1930 
1931 LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
1932                                                   const int32_t step) {
1933   auto* const dst = static_cast<int16_t*>(dest);
1934 
1935   // When combining the identity32 multiplier with the row shift, the
1936   // calculation for tx_height equal to 16 can be simplified from
1937   // (((A * 4) + 1) >> 1) to (A * 2).
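  // A * 4 is even, so the +1 is dropped by the shift and a saturating
  // doubling of each coefficient is sufficient.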
1938   for (int h = 0; h < 4; ++h) {
1939     for (int i = 0; i < 32; i += 8) {
1940       const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
1941       // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
1942       // a saturating add here is OK.
1943       const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
1944       StoreUnaligned16(&dst[h * step + i], v_dst_i);
1945     }
1946   }
1947 }
1948 
1949 LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
1950                                             int adjusted_tx_height) {
1951   if (adjusted_tx_height > 1) return false;
1952 
1953   auto* dst = static_cast<int16_t*>(dest);
1954   const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
1955   const __m128i v_kTransformRowMultiplier =
1956       _mm_set1_epi16(kTransformRowMultiplier << 3);
1957   const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1958 
1959   // When combining the identity32 multiplier with the row shift, the
1960   // calculation for tx_height equal to 16 can be simplified from
1961   // (((A * 4) + 1) >> 1) to (A * 2).
1962   const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
1963   dst[0] = _mm_extract_epi16(v_dst_0, 0);
1964   return true;
1965 }
1966 
1967 LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
1968     Array2DView<uint8_t> frame, const int start_x, const int start_y,
1969     const int tx_width, const int tx_height, const int16_t* source) {
1970   const int stride = frame.columns();
1971   uint8_t* dst = frame[start_y] + start_x;
1972   const __m128i v_two = _mm_set1_epi16(2);
1973 
1974   int i = 0;
1975   do {
1976     const int row = i * tx_width;
1977     int j = 0;
1978     do {
1979       const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
1980       const __m128i frame_data = LoadLo8(dst + j);
1981       const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
1982       const __m128i b = _mm_srai_epi16(a, 2);
1983       const __m128i c = _mm_cvtepu8_epi16(frame_data);
1984       const __m128i d = _mm_adds_epi16(c, b);
1985       StoreLo8(dst + j, _mm_packus_epi16(d, d));
1986       j += 8;
1987     } while (j < tx_width);
1988     dst += stride;
1989   } while (++i < tx_height);
1990 }
1991 
1992 //------------------------------------------------------------------------------
1993 // Walsh Hadamard Transform.
1994 
1995 // Process 4 wht4 rows and columns.
1996 LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
1997                                        const int start_x, const int start_y,
1998                                        const void* source,
1999                                        const int adjusted_tx_height) {
2000   const auto* const src = static_cast<const int16_t*>(source);
2001   __m128i s[4], x[4];
2002 
2003   if (adjusted_tx_height == 1) {
2004     // Special case: only src[0] is nonzero.
2005     //   src[0]  0   0   0
2006     //       0   0   0   0
2007     //       0   0   0   0
2008     //       0   0   0   0
2009     //
2010     // After the row and column transforms are applied, we have:
2011     //       f   h   h   h
2012     //       g   i   i   i
2013     //       g   i   i   i
2014     //       g   i   i   i
2015     // where f, g, h, i are computed as follows.
2016     int16_t f = (src[0] >> 2) - (src[0] >> 3);
2017     const int16_t g = f >> 1;
2018     f = f - (f >> 1);
2019     const int16_t h = (src[0] >> 3) - (src[0] >> 4);
2020     const int16_t i = (src[0] >> 4);
2021     s[0] = _mm_set1_epi16(h);
2022     s[0] = _mm_insert_epi16(s[0], f, 0);
2023     s[1] = _mm_set1_epi16(i);
2024     s[1] = _mm_insert_epi16(s[1], g, 0);
2025     s[2] = s[3] = s[1];
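    // s[0] now holds row 0 of the diagram (f followed by three h) and
    // s[1]..s[3] hold rows 1-3 (g followed by three i); the scalar formulas
    // above fold the >> 2 input scaling and both transform passes for this
    // single-coefficient case.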
2026   } else {
2027     x[0] = LoadLo8(&src[0 * 4]);
2028     x[2] = LoadLo8(&src[1 * 4]);
2029     x[3] = LoadLo8(&src[2 * 4]);
2030     x[1] = LoadLo8(&src[3 * 4]);
2031 
2032     // Row transforms.
2033     Transpose4x4_U16(x, x);
2034     s[0] = _mm_srai_epi16(x[0], 2);
2035     s[2] = _mm_srai_epi16(x[1], 2);
2036     s[3] = _mm_srai_epi16(x[2], 2);
2037     s[1] = _mm_srai_epi16(x[3], 2);
2038     s[0] = _mm_add_epi16(s[0], s[2]);
2039     s[3] = _mm_sub_epi16(s[3], s[1]);
2040     __m128i e = _mm_sub_epi16(s[0], s[3]);
2041     e = _mm_srai_epi16(e, 1);
2042     s[1] = _mm_sub_epi16(e, s[1]);
2043     s[2] = _mm_sub_epi16(e, s[2]);
2044     s[0] = _mm_sub_epi16(s[0], s[1]);
2045     s[3] = _mm_add_epi16(s[3], s[2]);
2046     Transpose4x4_U16(s, s);
2047 
2048     // Column transforms.
2049     s[0] = _mm_add_epi16(s[0], s[2]);
2050     s[3] = _mm_sub_epi16(s[3], s[1]);
2051     e = _mm_sub_epi16(s[0], s[3]);
2052     e = _mm_srai_epi16(e, 1);
2053     s[1] = _mm_sub_epi16(e, s[1]);
2054     s[2] = _mm_sub_epi16(e, s[2]);
2055     s[0] = _mm_sub_epi16(s[0], s[1]);
2056     s[3] = _mm_add_epi16(s[3], s[2]);
2057   }
2058 
2059   // Store to frame.
2060   const int stride = frame.columns();
2061   uint8_t* dst = frame[start_y] + start_x;
2062   for (int row = 0; row < 4; ++row) {
2063     const __m128i frame_data = Load4(dst);
2064     const __m128i a = _mm_cvtepu8_epi16(frame_data);
2065     // Saturate to prevent overflowing int16_t
2066     const __m128i b = _mm_adds_epi16(a, s[row]);
2067     Store4(dst, _mm_packus_epi16(b, b));
2068     dst += stride;
2069   }
2070 }
2071 
2072 //------------------------------------------------------------------------------
2073 // row/column transform loops
2074 
2075 template <bool enable_flip_rows = false>
2076 LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
2077     Array2DView<uint8_t> frame, const int start_x, const int start_y,
2078     const int tx_width, const int tx_height, const int16_t* source,
2079     TransformType tx_type) {
2080   const bool flip_rows =
2081       enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
2082   const __m128i v_eight = _mm_set1_epi16(8);
2083   const int stride = frame.columns();
2084   uint8_t* dst = frame[start_y] + start_x;
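  // Every branch below applies the same final rounding: add 8, arithmetic
  // shift right by 4 (the column-transform output shift in this 8-bit path),
  // widen the frame pixels to 16 bits, add the residual with saturation, and
  // pack back to uint8_t.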
2085   if (tx_width == 4) {
2086     for (int i = 0; i < tx_height; ++i) {
2087       const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
2088       const __m128i residual = LoadLo8(&source[row]);
2089       const __m128i frame_data = Load4(dst);
2090       // Saturate to prevent overflowing int16_t
2091       const __m128i a = _mm_adds_epi16(residual, v_eight);
2092       const __m128i b = _mm_srai_epi16(a, 4);
2093       const __m128i c = _mm_cvtepu8_epi16(frame_data);
2094       const __m128i d = _mm_adds_epi16(c, b);
2095       Store4(dst, _mm_packus_epi16(d, d));
2096       dst += stride;
2097     }
2098   } else if (tx_width == 8) {
2099     for (int i = 0; i < tx_height; ++i) {
2100       const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
2101       const __m128i residual = LoadUnaligned16(&source[row]);
2102       const __m128i frame_data = LoadLo8(dst);
2103       // Saturate to prevent overflowing int16_t
2104       const __m128i b = _mm_adds_epi16(residual, v_eight);
2105       const __m128i c = _mm_srai_epi16(b, 4);
2106       const __m128i d = _mm_cvtepu8_epi16(frame_data);
2107       const __m128i e = _mm_adds_epi16(d, c);
2108       StoreLo8(dst, _mm_packus_epi16(e, e));
2109       dst += stride;
2110     }
2111   } else {
2112     for (int i = 0; i < tx_height; ++i) {
2113       const int y = start_y + i;
2114       const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
2115       int j = 0;
2116       do {
2117         const int x = start_x + j;
2118         const __m128i residual = LoadUnaligned16(&source[row + j]);
2119         const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
2120         const __m128i frame_data = LoadUnaligned16(frame[y] + x);
2121         const __m128i b = _mm_adds_epi16(residual, v_eight);
2122         const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
2123         const __m128i c = _mm_srai_epi16(b, 4);
2124         const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
2125         const __m128i d = _mm_cvtepu8_epi16(frame_data);
2126         const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
2127         const __m128i e = _mm_adds_epi16(d, c);
2128         const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
2129         StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
2130         j += 16;
2131       } while (j < tx_width);
2132     }
2133   }
2134 }
2135 
2136 template <int tx_height>
2137 LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
2138   const __m128i word_reverse_8 =
2139       _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
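  // The shuffle control above reverses the order of the eight 16-bit lanes in
  // a register (each byte pair stays together), which mirrors a row of
  // coefficients in place.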
2140   if (tx_width >= 16) {
2141     int i = 0;
2142     do {
2143       // read 16 shorts
2144       const __m128i v3210 = LoadUnaligned16(&source[i]);
2145       const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
2146       const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
2147       const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
2148       StoreUnaligned16(&source[i], v4567);
2149       StoreUnaligned16(&source[i + 8], v0123);
2150       i += 16;
2151     } while (i < tx_width * tx_height);
2152   } else if (tx_width == 8) {
2153     for (int i = 0; i < 8 * tx_height; i += 8) {
2154       const __m128i a = LoadUnaligned16(&source[i]);
2155       const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
2156       StoreUnaligned16(&source[i], b);
2157     }
2158   } else {
2159     const __m128i dual_word_reverse_4 =
2160         _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
2161     // Process two rows per iteration.
2162     for (int i = 0; i < 4 * tx_height; i += 8) {
2163       const __m128i a = LoadUnaligned16(&source[i]);
2164       const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
2165       StoreUnaligned16(&source[i], b);
2166     }
2167   }
2168 }
2169 
2170 template <int tx_width>
2171 LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
2172   const __m128i v_kTransformRowMultiplier =
2173       _mm_set1_epi16(kTransformRowMultiplier << 3);
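  // _mm_mulhrs_epi16 computes (a * b + (1 << 14)) >> 15 per lane, so
  // pre-shifting kTransformRowMultiplier (from inverse_transform.inc) left by
  // 3 places it in the Q15 range the instruction expects; the net effect is
  // the ~1/sqrt(2) scaling applied to rows that require rounding.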
2174   if (tx_width == 4) {
2175     // Process two rows per iteration.
2176     int i = 0;
2177     do {
2178       const __m128i a = LoadUnaligned16(&source[i]);
2179       const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2180       StoreUnaligned16(&source[i], b);
2181       i += 8;
2182     } while (i < tx_width * num_rows);
2183   } else {
2184     int i = 0;
2185     do {
2186       // The last 32 values of every row are always zero if the |tx_width| is
2187       // 64.
2188       const int non_zero_width = (tx_width < 64) ? tx_width : 32;
2189       int j = 0;
2190       do {
2191         const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
2192         const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2193         StoreUnaligned16(&source[i * tx_width + j], b);
2194         j += 8;
2195       } while (j < non_zero_width);
2196     } while (++i < num_rows);
2197   }
2198 }
2199 
2200 template <int tx_width>
2201 LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
2202                                     int row_shift) {
2203   const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
2204   const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
2205   if (tx_width == 4) {
2206     // Process two rows per iteration.
2207     int i = 0;
2208     do {
2209       const __m128i residual = LoadUnaligned16(&source[i]);
2210       const __m128i shifted_residual =
2211           ShiftResidual(residual, v_row_shift_add, v_row_shift);
2212       StoreUnaligned16(&source[i], shifted_residual);
2213       i += 8;
2214     } while (i < tx_width * num_rows);
2215   } else {
2216     int i = 0;
2217     do {
2218       for (int j = 0; j < tx_width; j += 8) {
2219         const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
2220         const __m128i shifted_residual =
2221             ShiftResidual(residual, v_row_shift_add, v_row_shift);
2222         StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
2223       }
2224     } while (++i < num_rows);
2225   }
2226 }
2227 
2228 void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2229                                  TransformSize tx_size, int adjusted_tx_height,
2230                                  void* src_buffer, int /*start_x*/,
2231                                  int /*start_y*/, void* /*dst_frame*/) {
2232   auto* src = static_cast<int16_t*>(src_buffer);
2233   const int tx_height = kTransformHeight[tx_size];
2234   const bool should_round = (tx_height == 8);
2235   const int row_shift = static_cast<int>(tx_height == 16);
2236 
2237   if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
2238     return;
2239   }
2240 
2241   if (should_round) {
2242     ApplyRounding<4>(src, adjusted_tx_height);
2243   }
2244 
2245   if (adjusted_tx_height <= 4) {
2246     // Process 4 1d dct4 rows in parallel.
2247     Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
2248                                             /*transpose=*/true);
2249   } else {
2250     // Process 8 1d dct4 rows in parallel per iteration.
2251     int i = 0;
2252     do {
2253       Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
2254                                              /*transpose=*/true);
2255       i += 8;
2256     } while (i < adjusted_tx_height);
2257   }
2258   if (tx_height == 16) {
2259     RowShift<4>(src, adjusted_tx_height, 1);
2260   }
2261 }
2262 
2263 void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
2264                                     TransformSize tx_size,
2265                                     int adjusted_tx_height, void* src_buffer,
2266                                     int start_x, int start_y, void* dst_frame) {
2267   auto* src = static_cast<int16_t*>(src_buffer);
2268   const int tx_width = kTransformWidth[tx_size];
2269 
2270   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2271     FlipColumns<4>(src, tx_width);
2272   }
2273 
2274   if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
2275     if (tx_width == 4) {
2276       // Process 4 1d dct4 columns in parallel.
2277       Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
2278                                               /*transpose=*/false);
2279     } else {
2280       // Process 8 1d dct4 columns in parallel per iteration.
2281       int i = 0;
2282       do {
2283         Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
2284                                                /*transpose=*/false);
2285         i += 8;
2286       } while (i < tx_width);
2287     }
2288   }
2289   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2290   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
2291 }
2292 
2293 void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2294                                  TransformSize tx_size, int adjusted_tx_height,
2295                                  void* src_buffer, int /*start_x*/,
2296                                  int /*start_y*/, void* /*dst_frame*/) {
2297   auto* src = static_cast<int16_t*>(src_buffer);
2298   const bool should_round = kShouldRound[tx_size];
2299   const uint8_t row_shift = kTransformRowShift[tx_size];
2300 
2301   if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
2302     return;
2303   }
2304 
2305   if (should_round) {
2306     ApplyRounding<8>(src, adjusted_tx_height);
2307   }
2308 
2309   if (adjusted_tx_height <= 4) {
2310     // Process 4 1d dct8 rows in parallel.
2311     Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
2312   } else {
2313     // Process 8 1d dct8 rows in parallel per iteration.
2314     int i = 0;
2315     do {
2316       Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
2317                                               /*transpose=*/true);
2318       i += 8;
2319     } while (i < adjusted_tx_height);
2320   }
2321   if (row_shift > 0) {
2322     RowShift<8>(src, adjusted_tx_height, row_shift);
2323   }
2324 }
2325 
2326 void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
2327                                     TransformSize tx_size,
2328                                     int adjusted_tx_height, void* src_buffer,
2329                                     int start_x, int start_y, void* dst_frame) {
2330   auto* src = static_cast<int16_t*>(src_buffer);
2331   const int tx_width = kTransformWidth[tx_size];
2332 
2333   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2334     FlipColumns<8>(src, tx_width);
2335   }
2336 
2337   if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
2338     if (tx_width == 4) {
2339       // Process 4 1d dct8 columns in parallel.
2340       Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2341     } else {
2342       // Process 8 1d dct8 columns in parallel per iteration.
2343       int i = 0;
2344       do {
2345         Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2346                                                 /*transpose=*/false);
2347         i += 8;
2348       } while (i < tx_width);
2349     }
2350   }
2351   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2352   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
2353 }
2354 
2355 void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2356                                   TransformSize tx_size, int adjusted_tx_height,
2357                                   void* src_buffer, int /*start_x*/,
2358                                   int /*start_y*/, void* /*dst_frame*/) {
2359   auto* src = static_cast<int16_t*>(src_buffer);
2360   const bool should_round = kShouldRound[tx_size];
2361   const uint8_t row_shift = kTransformRowShift[tx_size];
2362 
2363   if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
2364     return;
2365   }
2366 
2367   if (should_round) {
2368     ApplyRounding<16>(src, adjusted_tx_height);
2369   }
2370 
2371   if (adjusted_tx_height <= 4) {
2372     // Process 4 1d dct16 rows in parallel.
2373     Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
2374   } else {
2375     int i = 0;
2376     do {
2377       // Process 8 1d dct16 rows in parallel per iteration.
2378       Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
2379                                                /*transpose=*/true);
2380       i += 8;
2381     } while (i < adjusted_tx_height);
2382   }
2383   // row_shift is always non-zero here.
2384   RowShift<16>(src, adjusted_tx_height, row_shift);
2385 }
2386 
2387 void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
2388                                      TransformSize tx_size,
2389                                      int adjusted_tx_height, void* src_buffer,
2390                                      int start_x, int start_y,
2391                                      void* dst_frame) {
2392   auto* src = static_cast<int16_t*>(src_buffer);
2393   const int tx_width = kTransformWidth[tx_size];
2394 
2395   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2396     FlipColumns<16>(src, tx_width);
2397   }
2398 
2399   if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
2400     if (tx_width == 4) {
2401       // Process 4 1d dct16 columns in parallel.
2402       Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2403     } else {
2404       int i = 0;
2405       do {
2406         // Process 8 1d dct16 columns in parallel per iteration.
2407         Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2408                                                  /*transpose=*/false);
2409         i += 8;
2410       } while (i < tx_width);
2411     }
2412   }
2413   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2414   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
2415 }
2416 
2417 void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2418                                   TransformSize tx_size, int adjusted_tx_height,
2419                                   void* src_buffer, int /*start_x*/,
2420                                   int /*start_y*/, void* /*dst_frame*/) {
2421   auto* src = static_cast<int16_t*>(src_buffer);
2422   const bool should_round = kShouldRound[tx_size];
2423   const uint8_t row_shift = kTransformRowShift[tx_size];
2424 
2425   if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
2426     return;
2427   }
2428 
2429   if (should_round) {
2430     ApplyRounding<32>(src, adjusted_tx_height);
2431   }
2432   // Process 8 1d dct32 rows in parallel per iteration.
2433   int i = 0;
2434   do {
2435     Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
2436     i += 8;
2437   } while (i < adjusted_tx_height);
2438   // row_shift is always non-zero here.
2439   RowShift<32>(src, adjusted_tx_height, row_shift);
2440 }
2441 
2442 void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
2443                                      TransformSize tx_size,
2444                                      int adjusted_tx_height, void* src_buffer,
2445                                      int start_x, int start_y,
2446                                      void* dst_frame) {
2447   auto* src = static_cast<int16_t*>(src_buffer);
2448   const int tx_width = kTransformWidth[tx_size];
2449 
2450   if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
2451     // Process 8 1d dct32 columns in parallel per iteration.
2452     int i = 0;
2453     do {
2454       Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
2455       i += 8;
2456     } while (i < tx_width);
2457   }
2458   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2459   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
2460 }
2461 
2462 void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2463                                   TransformSize tx_size, int adjusted_tx_height,
2464                                   void* src_buffer, int /*start_x*/,
2465                                   int /*start_y*/, void* /*dst_frame*/) {
2466   auto* src = static_cast<int16_t*>(src_buffer);
2467   const bool should_round = kShouldRound[tx_size];
2468   const uint8_t row_shift = kTransformRowShift[tx_size];
2469 
2470   if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
2471     return;
2472   }
2473 
2474   if (should_round) {
2475     ApplyRounding<64>(src, adjusted_tx_height);
2476   }
2477   // Process 8 1d dct64 rows in parallel per iteration.
2478   int i = 0;
2479   do {
2480     Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
2481     i += 8;
2482   } while (i < adjusted_tx_height);
2483   // row_shift is always non-zero here.
2484   RowShift<64>(src, adjusted_tx_height, row_shift);
2485 }
2486 
2487 void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
2488                                      TransformSize tx_size,
2489                                      int adjusted_tx_height, void* src_buffer,
2490                                      int start_x, int start_y,
2491                                      void* dst_frame) {
2492   auto* src = static_cast<int16_t*>(src_buffer);
2493   const int tx_width = kTransformWidth[tx_size];
2494 
2495   if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
2496     // Process 8 1d dct64 columns in parallel per iteration.
2497     int i = 0;
2498     do {
2499       Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
2500       i += 8;
2501     } while (i < tx_width);
2502   }
2503   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2504   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
2505 }
2506 
2507 void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2508                                   TransformSize tx_size, int adjusted_tx_height,
2509                                   void* src_buffer, int /*start_x*/,
2510                                   int /*start_y*/, void* /*dst_frame*/) {
2511   auto* src = static_cast<int16_t*>(src_buffer);
2512   const int tx_height = kTransformHeight[tx_size];
2513   const int row_shift = static_cast<int>(tx_height == 16);
2514   const bool should_round = (tx_height == 8);
2515 
2516   if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2517     return;
2518   }
2519 
2520   if (should_round) {
2521     ApplyRounding<4>(src, adjusted_tx_height);
2522   }
2523 
2524   // Process 4 1d adst4 rows in parallel per iteration.
2525   int i = 0;
2526   do {
2527     Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
2528     i += 4;
2529   } while (i < adjusted_tx_height);
2530 
2531   if (row_shift != 0) {
2532     RowShift<4>(src, adjusted_tx_height, 1);
2533   }
2534 }
2535 
2536 void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
2537                                      TransformSize tx_size,
2538                                      int adjusted_tx_height, void* src_buffer,
2539                                      int start_x, int start_y,
2540                                      void* dst_frame) {
2541   auto* src = static_cast<int16_t*>(src_buffer);
2542   const int tx_width = kTransformWidth[tx_size];
2543 
2544   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2545     FlipColumns<4>(src, tx_width);
2546   }
2547 
2548   if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2549     // Process 4 1d adst4 columns in parallel per iteration.
2550     int i = 0;
2551     do {
2552       Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
2553       i += 4;
2554     } while (i < tx_width);
2555   }
2556   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2557   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2558                                                    tx_width, 4, src, tx_type);
2559 }
2560 
2561 void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2562                                   TransformSize tx_size, int adjusted_tx_height,
2563                                   void* src_buffer, int /*start_x*/,
2564                                   int /*start_y*/, void* /*dst_frame*/) {
2565   auto* src = static_cast<int16_t*>(src_buffer);
2566   const bool should_round = kShouldRound[tx_size];
2567   const uint8_t row_shift = kTransformRowShift[tx_size];
2568 
2569   if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2570     return;
2571   }
2572 
2573   if (should_round) {
2574     ApplyRounding<8>(src, adjusted_tx_height);
2575   }
2576 
2577   if (adjusted_tx_height <= 4) {
2578     // Process 4 1d adst8 rows in parallel.
2579     Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
2580                                             /*transpose=*/true);
2581   } else {
2582     // Process 8 1d adst8 rows in parallel per iteration.
2583     int i = 0;
2584     do {
2585       Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
2586                                                /*transpose=*/true);
2587       i += 8;
2588     } while (i < adjusted_tx_height);
2589   }
2590   if (row_shift > 0) {
2591     RowShift<8>(src, adjusted_tx_height, row_shift);
2592   }
2593 }
2594 
2595 void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
2596                                      TransformSize tx_size,
2597                                      int adjusted_tx_height, void* src_buffer,
2598                                      int start_x, int start_y,
2599                                      void* dst_frame) {
2600   auto* src = static_cast<int16_t*>(src_buffer);
2601   const int tx_width = kTransformWidth[tx_size];
2602 
2603   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2604     FlipColumns<8>(src, tx_width);
2605   }
2606 
2607   if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2608     if (tx_width == 4) {
2609       // Process 4 1d adst8 columns in parallel.
2610       Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2611     } else {
2612       // Process 8 1d adst8 columns in parallel per iteration.
2613       int i = 0;
2614       do {
2615         Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2616                                                  /*transpose=*/false);
2617         i += 8;
2618       } while (i < tx_width);
2619     }
2620   }
2621   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2622   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2623                                                    tx_width, 8, src, tx_type);
2624 }
2625 
2626 void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
2627                                    TransformSize tx_size,
2628                                    int adjusted_tx_height, void* src_buffer,
2629                                    int /*start_x*/, int /*start_y*/,
2630                                    void* /*dst_frame*/) {
2631   auto* src = static_cast<int16_t*>(src_buffer);
2632   const bool should_round = kShouldRound[tx_size];
2633   const uint8_t row_shift = kTransformRowShift[tx_size];
2634 
2635   if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2636     return;
2637   }
2638 
2639   if (should_round) {
2640     ApplyRounding<16>(src, adjusted_tx_height);
2641   }
2642 
2643   if (adjusted_tx_height <= 4) {
2644     // Process 4 1d adst16 rows in parallel.
2645     Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
2646   } else {
2647     int i = 0;
2648     do {
2649       // Process 8 1d adst16 rows in parallel per iteration.
2650       Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
2651                                                 /*transpose=*/true);
2652       i += 8;
2653     } while (i < adjusted_tx_height);
2654   }
2655   // row_shift is always non-zero here.
2656   RowShift<16>(src, adjusted_tx_height, row_shift);
2657 }
2658 
2659 void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
2660                                       TransformSize tx_size,
2661                                       int adjusted_tx_height, void* src_buffer,
2662                                       int start_x, int start_y,
2663                                       void* dst_frame) {
2664   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2665   auto* src = static_cast<int16_t*>(src_buffer);
2666   const int tx_width = kTransformWidth[tx_size];
2667 
2668   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2669     FlipColumns<16>(src, tx_width);
2670   }
2671 
2672   if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2673     if (tx_width == 4) {
2674       // Process 4 1d adst16 columns in parallel.
2675       Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
2676     } else {
2677       int i = 0;
2678       do {
2679         // Process 8 1d adst16 columns in parallel per iteration.
2680         Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
2681                                                   /*transpose=*/false);
2682         i += 8;
2683       } while (i < tx_width);
2684     }
2685   }
2686   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2687                                                    tx_width, 16, src, tx_type);
2688 }
2689 
2690 void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
2691                                       TransformSize tx_size,
2692                                       int adjusted_tx_height, void* src_buffer,
2693                                       int /*start_x*/, int /*start_y*/,
2694                                       void* /*dst_frame*/) {
2695   // Special case: Process row calculations during column transform call.
2696   // Improves performance.
2697   if (tx_type == kTransformTypeIdentityIdentity &&
2698       tx_size == kTransformSize4x4) {
2699     return;
2700   }
2701 
2702   auto* src = static_cast<int16_t*>(src_buffer);
2703   const int tx_height = kTransformHeight[tx_size];
2704   const bool should_round = (tx_height == 8);
2705   if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
2706     return;
2707   }
2708 
2709   if (should_round) {
2710     ApplyRounding<4>(src, adjusted_tx_height);
2711   }
2712   if (tx_height < 16) {
2713     int i = 0;
2714     do {
2715       Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
2716       i += 4;
2717     } while (i < adjusted_tx_height);
2718   } else {
2719     int i = 0;
2720     do {
2721       Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
2722       i += 4;
2723     } while (i < adjusted_tx_height);
2724   }
2725 }
2726 
2727 void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
2728                                          TransformSize tx_size,
2729                                          int adjusted_tx_height,
2730                                          void* src_buffer, int start_x,
2731                                          int start_y, void* dst_frame) {
2732   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2733   auto* src = static_cast<int16_t*>(src_buffer);
2734   const int tx_width = kTransformWidth[tx_size];
2735 
2736   // Special case: Process row calculations during column transform call.
  if (tx_type == kTransformTypeIdentityIdentity &&
      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
                                   adjusted_tx_height, src);
    return;
  }

  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<4>(src, tx_width);
  }

  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
                              adjusted_tx_height, src);
}

void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
                                      TransformSize tx_size,
                                      int adjusted_tx_height, void* src_buffer,
                                      int /*start_x*/, int /*start_y*/,
                                      void* /*dst_frame*/) {
  // Special case: Process row calculations during column transform call.
  // Improves performance.
  if (tx_type == kTransformTypeIdentityIdentity &&
      tx_size == kTransformSize8x4) {
    return;
  }

  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_height = kTransformHeight[tx_size];
  const bool should_round = kShouldRound[tx_size];
  const uint8_t row_shift = kTransformRowShift[tx_size];
  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
    return;
  }

  if (should_round) {
    ApplyRounding<8>(src, adjusted_tx_height);
  }

  // When combining the identity8 multiplier with the row shift, the
  // calculations for tx_height == 8 and tx_height == 16 can be simplified
  // from ((A * 2) + 1) >> 1 to A.
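  // With an arithmetic right shift, ((A * 2) + 1) >> 1 == A for any integer A,
  // so the row pass leaves those heights untouched. The 0x18 mask below
  // matches exactly tx_height == 8 or tx_height == 16.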
  if ((tx_height & 0x18) != 0) {
    return;
  }
  if (tx_height == 32) {
    int i = 0;
    do {
      Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
      i += 4;
    } while (i < adjusted_tx_height);
    return;
  }

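  // Heights 8 and 16 returned early above and height 32 was handled, so only
  // the 8x4 case can reach this point.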
  assert(tx_size == kTransformSize8x4);
  int i = 0;
  do {
    Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
    i += 4;
  } while (i < adjusted_tx_height);
}

void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
                                         TransformSize tx_size,
                                         int adjusted_tx_height,
                                         void* src_buffer, int start_x,
                                         int start_y, void* dst_frame) {
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];

  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<8>(src, tx_width);
  }

  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
                                     adjusted_tx_height, src);
}

void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
                                       TransformSize tx_size,
                                       int adjusted_tx_height, void* src_buffer,
                                       int /*start_x*/, int /*start_y*/,
                                       void* /*dst_frame*/) {
  auto* src = static_cast<int16_t*>(src_buffer);
  const bool should_round = kShouldRound[tx_size];
  const uint8_t row_shift = kTransformRowShift[tx_size];
  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
    return;
  }

  if (should_round) {
    ApplyRounding<16>(src, adjusted_tx_height);
  }
  int i = 0;
  do {
    Identity16Row_SSE4_1(&src[i * 16], /*step=*/16, row_shift);
    i += 4;
  } while (i < adjusted_tx_height);
}

void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
                                          TransformSize tx_size,
                                          int adjusted_tx_height,
                                          void* src_buffer, int start_x,
                                          int start_y, void* dst_frame) {
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];

  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<16>(src, tx_width);
  }
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
                                      adjusted_tx_height, src);
}

void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
                                       TransformSize tx_size,
                                       int adjusted_tx_height, void* src_buffer,
                                       int /*start_x*/, int /*start_y*/,
                                       void* /*dst_frame*/) {
  const int tx_height = kTransformHeight[tx_size];
  // When combining the identity32 multiplier with the row shift, the
  // calculations for tx_height == 8 and tx_height == 32 can be simplified
  // from ((A * 4) + 2) >> 2 to A.
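  // With an arithmetic right shift, ((A * 4) + 2) >> 2 == A for any integer A,
  // so the row pass leaves those heights untouched. The 0x28 mask below
  // matches exactly tx_height == 8 or tx_height == 32.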
  if ((tx_height & 0x28) != 0) {
    return;
  }

  // Process kTransformSize32x16. The src is always rounded before the
  // identity transform and shifted by 1 afterwards.
  auto* src = static_cast<int16_t*>(src_buffer);
  if (Identity32DcOnly(src, adjusted_tx_height)) {
    return;
  }

  assert(tx_size == kTransformSize32x16);
  ApplyRounding<32>(src, adjusted_tx_height);
  int i = 0;
  do {
    Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
    i += 4;
  } while (i < adjusted_tx_height);
}

void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
                                          TransformSize tx_size,
                                          int adjusted_tx_height,
                                          void* src_buffer, int start_x,
                                          int start_y, void* dst_frame) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];

  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
                               adjusted_tx_height, src);
}

void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
                                 int /*adjusted_tx_height*/,
                                 void* /*src_buffer*/, int /*start_x*/,
                                 int /*start_y*/, void* /*dst_frame*/) {
  assert(tx_type == kTransformTypeDctDct);
  assert(tx_size == kTransformSize4x4);
  static_cast<void>(tx_type);
  static_cast<void>(tx_size);
  // Do both row and column transforms in the column-transform pass.
}

void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
                                    TransformSize tx_size,
                                    int adjusted_tx_height, void* src_buffer,
                                    int start_x, int start_y, void* dst_frame) {
  assert(tx_type == kTransformTypeDctDct);
  assert(tx_size == kTransformSize4x4);
  static_cast<void>(tx_type);
  static_cast<void>(tx_size);

  // Do both row and column transforms in the column-transform pass.
  // Process 4 1d wht4 rows and columns in parallel.
  const auto* src = static_cast<int16_t*>(src_buffer);
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
}

//------------------------------------------------------------------------------

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
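
  // Each enabled entry installs an SSE4.1 row-pass and column-pass function,
  // indexed by 1-D transform type and 1-D transform size. Entries not
  // overwritten here keep whatever implementation was installed by the earlier
  // initialization (typically the portable C versions).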

  // Maximum transform size for Dct is 64.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
      Dct4TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
      Dct4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
      Dct8TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
      Dct8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
      Dct16TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
      Dct16TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
      Dct32TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
      Dct32TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
      Dct64TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
      Dct64TransformLoopColumn_SSE4_1;
#endif

  // Maximum transform size for Adst is 16.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
      Adst4TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
      Adst4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
      Adst8TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
      Adst8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
      Adst16TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
      Adst16TransformLoopColumn_SSE4_1;
#endif

  // Maximum transform size for Identity transform is 32.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
      Identity4TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
      Identity4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
      Identity8TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
      Identity8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
      Identity16TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
      Identity16TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
      Identity32TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
      Identity32TransformLoopColumn_SSE4_1;
#endif

  // Maximum transform size for Wht is 4.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
      Wht4TransformLoopRow_SSE4_1;
  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
      Wht4TransformLoopColumn_SSE4_1;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1
#else   // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void InverseTransformInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1