• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/inverse_transform.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_SSE4_1
19 
20 #include <smmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstdint>
25 #include <cstring>
26 
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/dsp/x86/transpose_sse4.h"
31 #include "src/utils/array_2d.h"
32 #include "src/utils/common.h"
33 #include "src/utils/compiler_attributes.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace low_bitdepth {
38 namespace {
39 
40 // Include the constants and utility functions inside the anonymous namespace.
41 #include "src/dsp/inverse_transform.inc"
42 
43 template <int store_width, int store_count>
StoreDst(int16_t * dst,int32_t stride,int32_t idx,const __m128i * s)44 LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
45                                     const __m128i* s) {
46   // NOTE: It is expected that the compiler will unroll these loops.
47   if (store_width == 16) {
48     for (int i = 0; i < store_count; i += 4) {
49       StoreUnaligned16(&dst[i * stride + idx], s[i]);
50       StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
51       StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
52       StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
53     }
54   }
55   if (store_width == 8) {
56     for (int i = 0; i < store_count; i += 4) {
57       StoreLo8(&dst[i * stride + idx], s[i]);
58       StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
59       StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
60       StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
61     }
62   }
63 }
64 
65 template <int load_width, int load_count>
LoadSrc(const int16_t * src,int32_t stride,int32_t idx,__m128i * x)66 LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
67                                    int32_t idx, __m128i* x) {
68   // NOTE: It is expected that the compiler will unroll these loops.
69   if (load_width == 16) {
70     for (int i = 0; i < load_count; i += 4) {
71       x[i] = LoadUnaligned16(&src[i * stride + idx]);
72       x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
73       x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
74       x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
75     }
76   }
77   if (load_width == 8) {
78     for (int i = 0; i < load_count; i += 4) {
79       x[i] = LoadLo8(&src[i * stride + idx]);
80       x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
81       x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
82       x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
83     }
84   }
85 }
86 
87 // Butterfly rotate 4 values.
ButterflyRotation_4(__m128i * a,__m128i * b,const int angle,const bool flip)88 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
89                                                const int angle,
90                                                const bool flip) {
91   const int16_t cos128 = Cos128(angle);
92   const int16_t sin128 = Sin128(angle);
93   const __m128i psin_pcos = _mm_set1_epi32(
94       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
95   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
96   const __m128i ab = _mm_unpacklo_epi16(*b, *a);
97   const __m128i sign =
98       _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
99   // -sin cos, -sin cos, -sin cos, -sin cos
100   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
101   const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
102   const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
103   const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
104   const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
105   const __m128i x = _mm_packs_epi32(x1, x1);
106   const __m128i y = _mm_packs_epi32(y1, y1);
107   if (flip) {
108     *a = y;
109     *b = x;
110   } else {
111     *a = x;
112     *b = y;
113   }
114 }
115 
116 // Butterfly rotate 8 values.
ButterflyRotation_8(__m128i * a,__m128i * b,const int angle,const bool flip)117 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
118                                                const int angle,
119                                                const bool flip) {
120   const int16_t cos128 = Cos128(angle);
121   const int16_t sin128 = Sin128(angle);
122   const __m128i psin_pcos = _mm_set1_epi32(
123       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
124   const __m128i sign =
125       _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
126   // -sin cos, -sin cos, -sin cos, -sin cos
127   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
128   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
129   const __m128i ab = _mm_unpacklo_epi16(*b, *a);
130   const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
131   const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
132   const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
133   const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
134   const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
135   const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
136   const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
137   const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
138   const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
139   const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
140   const __m128i x = _mm_packs_epi32(x1, x1_hi);
141   const __m128i y = _mm_packs_epi32(y1, y1_hi);
142   if (flip) {
143     *a = y;
144     *b = x;
145   } else {
146     *a = x;
147     *b = y;
148   }
149 }
150 
ButterflyRotation_FirstIsZero(__m128i * a,__m128i * b,const int angle,const bool flip)151 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
152                                                          const int angle,
153                                                          const bool flip) {
154   const int16_t cos128 = Cos128(angle);
155   const int16_t sin128 = Sin128(angle);
156   const __m128i pcos = _mm_set1_epi16(cos128 << 3);
157   const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
158   const __m128i x = _mm_mulhrs_epi16(*b, psin);
159   const __m128i y = _mm_mulhrs_epi16(*b, pcos);
160   if (flip) {
161     *a = y;
162     *b = x;
163   } else {
164     *a = x;
165     *b = y;
166   }
167 }
168 
ButterflyRotation_SecondIsZero(__m128i * a,__m128i * b,const int angle,const bool flip)169 LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
170                                                           __m128i* b,
171                                                           const int angle,
172                                                           const bool flip) {
173   const int16_t cos128 = Cos128(angle);
174   const int16_t sin128 = Sin128(angle);
175   const __m128i pcos = _mm_set1_epi16(cos128 << 3);
176   const __m128i psin = _mm_set1_epi16(sin128 << 3);
177   const __m128i x = _mm_mulhrs_epi16(*a, pcos);
178   const __m128i y = _mm_mulhrs_epi16(*a, psin);
179   if (flip) {
180     *a = y;
181     *b = x;
182   } else {
183     *a = x;
184     *b = y;
185   }
186 }
187 
HadamardRotation(__m128i * a,__m128i * b,bool flip)188 LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
189   __m128i x, y;
190   if (flip) {
191     y = _mm_adds_epi16(*b, *a);
192     x = _mm_subs_epi16(*b, *a);
193   } else {
194     x = _mm_adds_epi16(*a, *b);
195     y = _mm_subs_epi16(*a, *b);
196   }
197   *a = x;
198   *b = y;
199 }
200 
// Signature shared by ButterflyRotation_4/ButterflyRotation_8 so the DCT
// stage templates can select an implementation per transform size.
using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
                                       bool flip);
203 
ShiftResidual(const __m128i residual,const __m128i v_row_shift_add,const __m128i v_row_shift)204 LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
205                                             const __m128i v_row_shift_add,
206                                             const __m128i v_row_shift) {
207   const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
208   // The max row_shift is 2, so int16_t values greater than 0x7ffd may
209   // overflow.  Generate a mask for this case.
210   const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
211   const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
212   // Assume int16_t values.
213   const __m128i a = _mm_sra_epi16(x, v_row_shift);
214   // Assume uint16_t values.
215   const __m128i b = _mm_srl_epi16(x, v_row_shift);
216   // Select the correct shifted value.
217   return _mm_blendv_epi8(a, b, mask);
218 }
219 
220 //------------------------------------------------------------------------------
221 // Discrete Cosine Transforms (DCT).
222 
// Fast path for a DCT row pass when at most the DC coefficient is nonzero:
// broadcasts src[0], optionally applies the row rounding multiplier, scales
// by Cos128(32) (the DC gain), applies |row_shift|, and stores the value
// across |width| outputs. Returns false (doing nothing) when more than one
// coefficient is nonzero.
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
                                     int non_zero_coeff_count,
                                     bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  // Broadcast src[0]; for width 4 only the low four lanes are stored.
  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
  const __m128i v_src =
      (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
  // All-ones mask selects the rounded value when |should_round| is set.
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const int16_t cos128 = Cos128(32);
  const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));

  // Expand to 32 bits to prevent int16_t overflows during the shift add.
  // For row_shift in [0, 2], row_shift == (1 << row_shift) >> 1, so adding
  // it implements round-to-nearest before the arithmetic shift.
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_cvtepi16_epi32(xy);
  const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
  const __m128i b = _mm_add_epi32(a, v_row_shift_add);
  const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
  const __m128i c = _mm_sra_epi32(b, v_row_shift);
  const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
  const __m128i xy_shifted = _mm_packs_epi32(c, c1);

  // Replicate the single shifted value across the whole row.
  if (width == 4) {
    StoreLo8(dst, xy_shifted);
  } else {
    for (int i = 0; i < width; i += 8) {
      StoreUnaligned16(dst, xy_shifted);
      dst += 8;
    }
  }
  return true;
}
267 
// Fast path for a DCT column pass when at most the DC coefficient is
// nonzero: scales the first row by Cos128(32) and replicates it down the
// |height| rows. Returns false (doing nothing) when more than one
// coefficient is nonzero.
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           int width) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  const int16_t cos128 = Cos128(32);

  // Calculate dc values for first row.
  if (width == 4) {
    const __m128i v_src = LoadLo8(src);
    const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
    StoreLo8(dst, xy);
  } else {
    int i = 0;
    do {
      const __m128i v_src = LoadUnaligned16(&src[i]);
      const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
      StoreUnaligned16(&dst[i], xy);
      i += 8;
    } while (i < width);
  }

  // Copy first row to the rest of the block.
  // NOTE(review): row y is copied from |src| row y - 1, which replicates the
  // first row only when |source| and |dest| alias (in-place use), so each
  // iteration reads the row written by the previous one — confirm callers
  // pass the same buffer for both.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
  }
  return true;
}
301 
302 template <ButterflyRotationFunc bufferfly_rotation,
303           bool is_fast_bufferfly = false>
Dct4Stages(__m128i * s)304 LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
305   // stage 12.
306   if (is_fast_bufferfly) {
307     ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
308     ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
309   } else {
310     bufferfly_rotation(&s[0], &s[1], 32, true);
311     bufferfly_rotation(&s[2], &s[3], 48, false);
312   }
313 
314   // stage 17.
315   HadamardRotation(&s[0], &s[3], false);
316   HadamardRotation(&s[1], &s[2], false);
317 }
318 
// Process 4 dct4 rows or columns, depending on the transpose flag.
// |step| is the int16_t distance between rows of |source|/|dest|;
// |stage_is_rectangular| selects the 4x8-style load/store layout.
template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, const void* source,
                                       int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[4], x[4];

  // Load the input into |x|, transposing when |transpose| is set.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(src, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(src, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(src, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }
  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<bufferfly_rotation>(s);

  // Store the result, undoing the transpose when it was applied on load.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(s, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(s, s);
    }
    StoreDst<8, 4>(dst, step, 0, s);
  }
}
365 
366 template <ButterflyRotationFunc bufferfly_rotation,
367           bool is_fast_bufferfly = false>
Dct8Stages(__m128i * s)368 LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
369   // stage 8.
370   if (is_fast_bufferfly) {
371     ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
372     ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
373   } else {
374     bufferfly_rotation(&s[4], &s[7], 56, false);
375     bufferfly_rotation(&s[5], &s[6], 24, false);
376   }
377 
378   // stage 13.
379   HadamardRotation(&s[4], &s[5], false);
380   HadamardRotation(&s[6], &s[7], true);
381 
382   // stage 18.
383   bufferfly_rotation(&s[6], &s[5], 32, true);
384 
385   // stage 22.
386   HadamardRotation(&s[0], &s[7], false);
387   HadamardRotation(&s[1], &s[6], false);
388   HadamardRotation(&s[2], &s[5], false);
389   HadamardRotation(&s[3], &s[4], false);
390 }
391 
// Process dct8 rows or columns, depending on the transpose flag.
// |step| is the int16_t distance between rows of |source|/|dest|;
// |stage_is_rectangular| selects the 8x4-style load/store layout.
template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, const void* source,
                                       int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[8];

  // Load the input into |x|, transposing when |transpose| is set.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(src, step, 0, x);
    }
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<bufferfly_rotation>(s);
  Dct8Stages<bufferfly_rotation>(s);

  // Store the result, undoing the transpose when it was applied on load.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(s, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, s);
    }
  }
}
450 
451 template <ButterflyRotationFunc bufferfly_rotation,
452           bool is_fast_bufferfly = false>
Dct16Stages(__m128i * s)453 LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
454   // stage 5.
455   if (is_fast_bufferfly) {
456     ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
457     ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
458     ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
459     ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
460   } else {
461     bufferfly_rotation(&s[8], &s[15], 60, false);
462     bufferfly_rotation(&s[9], &s[14], 28, false);
463     bufferfly_rotation(&s[10], &s[13], 44, false);
464     bufferfly_rotation(&s[11], &s[12], 12, false);
465   }
466 
467   // stage 9.
468   HadamardRotation(&s[8], &s[9], false);
469   HadamardRotation(&s[10], &s[11], true);
470   HadamardRotation(&s[12], &s[13], false);
471   HadamardRotation(&s[14], &s[15], true);
472 
473   // stage 14.
474   bufferfly_rotation(&s[14], &s[9], 48, true);
475   bufferfly_rotation(&s[13], &s[10], 112, true);
476 
477   // stage 19.
478   HadamardRotation(&s[8], &s[11], false);
479   HadamardRotation(&s[9], &s[10], false);
480   HadamardRotation(&s[12], &s[15], true);
481   HadamardRotation(&s[13], &s[14], true);
482 
483   // stage 23.
484   bufferfly_rotation(&s[13], &s[10], 32, true);
485   bufferfly_rotation(&s[12], &s[11], 32, true);
486 
487   // stage 26.
488   HadamardRotation(&s[0], &s[15], false);
489   HadamardRotation(&s[1], &s[14], false);
490   HadamardRotation(&s[2], &s[13], false);
491   HadamardRotation(&s[3], &s[12], false);
492   HadamardRotation(&s[4], &s[11], false);
493   HadamardRotation(&s[5], &s[10], false);
494   HadamardRotation(&s[6], &s[9], false);
495   HadamardRotation(&s[7], &s[8], false);
496 }
497 
// Process dct16 rows or columns, depending on the transpose flag.
// |step| is the int16_t distance between rows of |source|/|dest|;
// |stage_is_rectangular| selects the split 8x4-style load/store layout.
template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[16], x[16];

  // Load the input into |x|, transposing 8x8 tiles when |transpose| is set.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(src, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(src, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(src, step, 0, x);
    }
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<bufferfly_rotation>(s);
  Dct8Stages<bufferfly_rotation>(s);
  Dct16Stages<bufferfly_rotation>(s);

  // Store the result, undoing the transpose when it was applied on load.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(s, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&s[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, s);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&s[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, s);
    }
  }
}
573 
574 template <ButterflyRotationFunc bufferfly_rotation,
575           bool is_fast_butterfly = false>
Dct32Stages(__m128i * s)576 LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
577   // stage 3
578   if (is_fast_butterfly) {
579     ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
580     ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
581     ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
582     ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
583     ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
584     ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
585     ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
586     ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
587   } else {
588     bufferfly_rotation(&s[16], &s[31], 62, false);
589     bufferfly_rotation(&s[17], &s[30], 30, false);
590     bufferfly_rotation(&s[18], &s[29], 46, false);
591     bufferfly_rotation(&s[19], &s[28], 14, false);
592     bufferfly_rotation(&s[20], &s[27], 54, false);
593     bufferfly_rotation(&s[21], &s[26], 22, false);
594     bufferfly_rotation(&s[22], &s[25], 38, false);
595     bufferfly_rotation(&s[23], &s[24], 6, false);
596   }
597   // stage 6.
598   HadamardRotation(&s[16], &s[17], false);
599   HadamardRotation(&s[18], &s[19], true);
600   HadamardRotation(&s[20], &s[21], false);
601   HadamardRotation(&s[22], &s[23], true);
602   HadamardRotation(&s[24], &s[25], false);
603   HadamardRotation(&s[26], &s[27], true);
604   HadamardRotation(&s[28], &s[29], false);
605   HadamardRotation(&s[30], &s[31], true);
606 
607   // stage 10.
608   bufferfly_rotation(&s[30], &s[17], 24 + 32, true);
609   bufferfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
610   bufferfly_rotation(&s[26], &s[21], 24, true);
611   bufferfly_rotation(&s[25], &s[22], 24 + 64, true);
612 
613   // stage 15.
614   HadamardRotation(&s[16], &s[19], false);
615   HadamardRotation(&s[17], &s[18], false);
616   HadamardRotation(&s[20], &s[23], true);
617   HadamardRotation(&s[21], &s[22], true);
618   HadamardRotation(&s[24], &s[27], false);
619   HadamardRotation(&s[25], &s[26], false);
620   HadamardRotation(&s[28], &s[31], true);
621   HadamardRotation(&s[29], &s[30], true);
622 
623   // stage 20.
624   bufferfly_rotation(&s[29], &s[18], 48, true);
625   bufferfly_rotation(&s[28], &s[19], 48, true);
626   bufferfly_rotation(&s[27], &s[20], 48 + 64, true);
627   bufferfly_rotation(&s[26], &s[21], 48 + 64, true);
628 
629   // stage 24.
630   HadamardRotation(&s[16], &s[23], false);
631   HadamardRotation(&s[17], &s[22], false);
632   HadamardRotation(&s[18], &s[21], false);
633   HadamardRotation(&s[19], &s[20], false);
634   HadamardRotation(&s[24], &s[31], true);
635   HadamardRotation(&s[25], &s[30], true);
636   HadamardRotation(&s[26], &s[29], true);
637   HadamardRotation(&s[27], &s[28], true);
638 
639   // stage 27.
640   bufferfly_rotation(&s[27], &s[20], 32, true);
641   bufferfly_rotation(&s[26], &s[21], 32, true);
642   bufferfly_rotation(&s[25], &s[22], 32, true);
643   bufferfly_rotation(&s[24], &s[23], 32, true);
644 
645   // stage 29.
646   HadamardRotation(&s[0], &s[31], false);
647   HadamardRotation(&s[1], &s[30], false);
648   HadamardRotation(&s[2], &s[29], false);
649   HadamardRotation(&s[3], &s[28], false);
650   HadamardRotation(&s[4], &s[27], false);
651   HadamardRotation(&s[5], &s[26], false);
652   HadamardRotation(&s[6], &s[25], false);
653   HadamardRotation(&s[7], &s[24], false);
654   HadamardRotation(&s[8], &s[23], false);
655   HadamardRotation(&s[9], &s[22], false);
656   HadamardRotation(&s[10], &s[21], false);
657   HadamardRotation(&s[11], &s[20], false);
658   HadamardRotation(&s[12], &s[19], false);
659   HadamardRotation(&s[13], &s[18], false);
660   HadamardRotation(&s[14], &s[17], false);
661   HadamardRotation(&s[15], &s[16], false);
662 }
663 
// Process dct32 rows or columns, depending on the transpose flag.
// |step| is the int16_t distance between rows of |source|/|dest|. Always
// uses ButterflyRotation_8 since all vectors carry 8 values.
LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const void* source,
                                        const int32_t step,
                                        const bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[32], x[32];

  // Load the input into |x|, transposing 8x8 tiles when |transpose| is set.
  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, idx, input);
      Transpose8x8_U16(input, &x[idx]);
    }
  } else {
    LoadSrc<16, 32>(src, step, 0, x);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_8>(s);
  Dct8Stages<ButterflyRotation_8>(s);
  Dct16Stages<ButterflyRotation_8>(s);
  Dct32Stages<ButterflyRotation_8>(s);

  // Store the result, undoing the transpose when it was applied on load.
  if (transpose) {
    for (int idx = 0; idx < 32; idx += 8) {
      __m128i output[8];
      Transpose8x8_U16(&s[idx], output);
      StoreDst<16, 8>(dst, step, idx, output);
    }
  } else {
    StoreDst<16, 32>(dst, step, 0, s);
  }
}
735 
736 // Allow the compiler to call this function instead of force inlining. Tests
737 // show the performance is slightly faster.
Dct64_SSE4_1(void * dest,const void * source,int32_t step,bool transpose)738 void Dct64_SSE4_1(void* dest, const void* source, int32_t step,
739                   bool transpose) {
740   auto* const dst = static_cast<int16_t*>(dest);
741   const auto* const src = static_cast<const int16_t*>(source);
742   __m128i s[64], x[32];
743 
744   if (transpose) {
745     // The last 32 values of every row are always zero if the |tx_width| is
746     // 64.
747     for (int idx = 0; idx < 32; idx += 8) {
748       __m128i input[8];
749       LoadSrc<16, 8>(src, step, idx, input);
750       Transpose8x8_U16(input, &x[idx]);
751     }
752   } else {
753     // The last 32 values of every column are always zero if the |tx_height| is
754     // 64.
755     LoadSrc<16, 32>(src, step, 0, x);
756   }
757 
758   // stage 1
759   // kBitReverseLookup
760   // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
761   s[0] = x[0];
762   s[2] = x[16];
763   s[4] = x[8];
764   s[6] = x[24];
765   s[8] = x[4];
766   s[10] = x[20];
767   s[12] = x[12];
768   s[14] = x[28];
769 
770   // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
771   s[16] = x[2];
772   s[18] = x[18];
773   s[20] = x[10];
774   s[22] = x[26];
775   s[24] = x[6];
776   s[26] = x[22];
777   s[28] = x[14];
778   s[30] = x[30];
779 
780   // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
781   s[32] = x[1];
782   s[34] = x[17];
783   s[36] = x[9];
784   s[38] = x[25];
785   s[40] = x[5];
786   s[42] = x[21];
787   s[44] = x[13];
788   s[46] = x[29];
789 
790   // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
791   s[48] = x[3];
792   s[50] = x[19];
793   s[52] = x[11];
794   s[54] = x[27];
795   s[56] = x[7];
796   s[58] = x[23];
797   s[60] = x[15];
798   s[62] = x[31];
799 
800   Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
801   Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
802   Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
803   Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
804 
805   //-- start dct 64 stages
806   // stage 2.
807   ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
808   ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
809   ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
810   ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
811   ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
812   ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
813   ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
814   ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
815   ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
816   ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
817   ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
818   ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
819   ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
820   ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
821   ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
822   ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
823 
824   // stage 4.
825   HadamardRotation(&s[32], &s[33], false);
826   HadamardRotation(&s[34], &s[35], true);
827   HadamardRotation(&s[36], &s[37], false);
828   HadamardRotation(&s[38], &s[39], true);
829   HadamardRotation(&s[40], &s[41], false);
830   HadamardRotation(&s[42], &s[43], true);
831   HadamardRotation(&s[44], &s[45], false);
832   HadamardRotation(&s[46], &s[47], true);
833   HadamardRotation(&s[48], &s[49], false);
834   HadamardRotation(&s[50], &s[51], true);
835   HadamardRotation(&s[52], &s[53], false);
836   HadamardRotation(&s[54], &s[55], true);
837   HadamardRotation(&s[56], &s[57], false);
838   HadamardRotation(&s[58], &s[59], true);
839   HadamardRotation(&s[60], &s[61], false);
840   HadamardRotation(&s[62], &s[63], true);
841 
842   // stage 7.
843   ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
844   ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
845   ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
846   ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
847   ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
848   ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
849   ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
850   ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
851 
852   // stage 11.
853   HadamardRotation(&s[32], &s[35], false);
854   HadamardRotation(&s[33], &s[34], false);
855   HadamardRotation(&s[36], &s[39], true);
856   HadamardRotation(&s[37], &s[38], true);
857   HadamardRotation(&s[40], &s[43], false);
858   HadamardRotation(&s[41], &s[42], false);
859   HadamardRotation(&s[44], &s[47], true);
860   HadamardRotation(&s[45], &s[46], true);
861   HadamardRotation(&s[48], &s[51], false);
862   HadamardRotation(&s[49], &s[50], false);
863   HadamardRotation(&s[52], &s[55], true);
864   HadamardRotation(&s[53], &s[54], true);
865   HadamardRotation(&s[56], &s[59], false);
866   HadamardRotation(&s[57], &s[58], false);
867   HadamardRotation(&s[60], &s[63], true);
868   HadamardRotation(&s[61], &s[62], true);
869 
870   // stage 16.
871   ButterflyRotation_8(&s[61], &s[34], 56, true);
872   ButterflyRotation_8(&s[60], &s[35], 56, true);
873   ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
874   ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
875   ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
876   ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
877   ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
878   ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
879 
880   // stage 21.
881   HadamardRotation(&s[32], &s[39], false);
882   HadamardRotation(&s[33], &s[38], false);
883   HadamardRotation(&s[34], &s[37], false);
884   HadamardRotation(&s[35], &s[36], false);
885   HadamardRotation(&s[40], &s[47], true);
886   HadamardRotation(&s[41], &s[46], true);
887   HadamardRotation(&s[42], &s[45], true);
888   HadamardRotation(&s[43], &s[44], true);
889   HadamardRotation(&s[48], &s[55], false);
890   HadamardRotation(&s[49], &s[54], false);
891   HadamardRotation(&s[50], &s[53], false);
892   HadamardRotation(&s[51], &s[52], false);
893   HadamardRotation(&s[56], &s[63], true);
894   HadamardRotation(&s[57], &s[62], true);
895   HadamardRotation(&s[58], &s[61], true);
896   HadamardRotation(&s[59], &s[60], true);
897 
898   // stage 25.
899   ButterflyRotation_8(&s[59], &s[36], 48, true);
900   ButterflyRotation_8(&s[58], &s[37], 48, true);
901   ButterflyRotation_8(&s[57], &s[38], 48, true);
902   ButterflyRotation_8(&s[56], &s[39], 48, true);
903   ButterflyRotation_8(&s[55], &s[40], 112, true);
904   ButterflyRotation_8(&s[54], &s[41], 112, true);
905   ButterflyRotation_8(&s[53], &s[42], 112, true);
906   ButterflyRotation_8(&s[52], &s[43], 112, true);
907 
908   // stage 28.
909   HadamardRotation(&s[32], &s[47], false);
910   HadamardRotation(&s[33], &s[46], false);
911   HadamardRotation(&s[34], &s[45], false);
912   HadamardRotation(&s[35], &s[44], false);
913   HadamardRotation(&s[36], &s[43], false);
914   HadamardRotation(&s[37], &s[42], false);
915   HadamardRotation(&s[38], &s[41], false);
916   HadamardRotation(&s[39], &s[40], false);
917   HadamardRotation(&s[48], &s[63], true);
918   HadamardRotation(&s[49], &s[62], true);
919   HadamardRotation(&s[50], &s[61], true);
920   HadamardRotation(&s[51], &s[60], true);
921   HadamardRotation(&s[52], &s[59], true);
922   HadamardRotation(&s[53], &s[58], true);
923   HadamardRotation(&s[54], &s[57], true);
924   HadamardRotation(&s[55], &s[56], true);
925 
926   // stage 30.
927   ButterflyRotation_8(&s[55], &s[40], 32, true);
928   ButterflyRotation_8(&s[54], &s[41], 32, true);
929   ButterflyRotation_8(&s[53], &s[42], 32, true);
930   ButterflyRotation_8(&s[52], &s[43], 32, true);
931   ButterflyRotation_8(&s[51], &s[44], 32, true);
932   ButterflyRotation_8(&s[50], &s[45], 32, true);
933   ButterflyRotation_8(&s[49], &s[46], 32, true);
934   ButterflyRotation_8(&s[48], &s[47], 32, true);
935 
936   // stage 31.
937   for (int i = 0; i < 32; i += 4) {
938     HadamardRotation(&s[i], &s[63 - i], false);
939     HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
940     HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
941     HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
942   }
943   //-- end dct 64 stages
944 
945   if (transpose) {
946     for (int idx = 0; idx < 64; idx += 8) {
947       __m128i output[8];
948       Transpose8x8_U16(&s[idx], output);
949       StoreDst<16, 8>(dst, step, idx, output);
950     }
951   } else {
952     StoreDst<16, 64>(dst, step, 0, s);
953   }
954 }
955 
956 //------------------------------------------------------------------------------
957 // Asymmetric Discrete Sine Transforms (ADST).
958 
// 4-point ADST. When stage_is_rectangular is true the function processes a
// 4x8 tile per call; otherwise a 4x4 tile. transpose selects whether the
// data is transposed on load and again on store.
template <bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[4];

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[8];
      LoadSrc<8, 8>(src, step, 0, input);
      Transpose4x8To8x4_U16(input, x);
    } else {
      LoadSrc<16, 4>(src, step, 0, x);
    }
  } else {
    LoadSrc<8, 4>(src, step, 0, x);
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
  }

  // Broadcast the multipliers. The packed 32-bit constants pair two 16-bit
  // multipliers per lane so that a single _mm_madd_epi16 over interleaved
  // (x0, x2) inputs forms a weighted sum of both: kAdst4Multiplier_m0_1
  // packs (k1, -k0) and kAdst4Multiplier_3_0 packs (k0, k3).
  const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
  const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
  const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
  const __m128i kAdst4Multiplier_m0_1 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
                     (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
  const __m128i kAdst4Multiplier_3_0 =
      _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
                     (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));

  // stage 1. Interleave pairs of inputs (or zero-extend single inputs) so
  // each madd below produces 32-bit products or sums of products.
  const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
  const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
  const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
  const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
  const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);

  s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
  s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);

  // stage 2.
  // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
  const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
  const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
  const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);

  // stage 3.
  s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);   // k0*x0 + k3*x2
  s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);  // k1*x0 - k0*x2
  s[2] = b7;
  s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);

  // stage 4.
  s[0] = _mm_add_epi32(s[0], s[5]);
  s[1] = _mm_sub_epi32(s[1], s[6]);

  // stages 5 and 6.
  x[0] = _mm_add_epi32(s[0], s[3]);
  x[1] = _mm_add_epi32(s[1], s[3]);
  x[2] = _mm_add_epi32(s[0], s[1]);
  x[3] = _mm_sub_epi32(x[2], s[3]);

  // Round the 32-bit accumulators back down (Q12 fixed point) and repack
  // to four 16-bit result vectors.
  x[0] = RightShiftWithRounding_S32(x[0], 12);
  x[1] = RightShiftWithRounding_S32(x[1], 12);
  x[2] = RightShiftWithRounding_S32(s[2], 12);
  x[3] = RightShiftWithRounding_S32(x[3], 12);

  x[0] = _mm_packs_epi32(x[0], x[1]);
  x[2] = _mm_packs_epi32(x[2], x[3]);
  x[1] = _mm_srli_si128(x[0], 8);
  x[3] = _mm_srli_si128(x[2], 8);

  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[8];
      Transpose8x4To4x8_U16(x, output);
      StoreDst<8, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 4>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      Transpose4x4_U16(x, x);
    }
    StoreDst<8, 4>(dst, step, 0, x);
  }
}
1048 
// Multipliers for the DC-only 4-point ADST, interleaved with zeros so that a
// single _mm_madd_epi16 against a broadcast source s yields
// {s*1321, s*2482, s*3344, s*(2482 + 1321)} in the four 32-bit lanes.
constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
                                               3344, 0, 2482, 1321};
1051 
// Fast path for a 4-point ADST row transform when at most the DC coefficient
// is non-zero. Returns false when the general transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
                                       int non_zero_coeff_count,
                                       bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  // Broadcast src[0] to all eight 16-bit lanes.
  const __m128i v_src =
      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0), 0);
  // Optionally pre-scale by kTransformRowMultiplier (Q15 multiply via
  // mulhrs); blendv selects the scaled value only when should_round is set.
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
  const __m128i v_kAdst4DcOnlyMultipliers =
      LoadUnaligned16(kAdst4DcOnlyMultiplier);
  // s0*k0 s0*k1 s0*k2 s0*k1
  // +
  // s0*0  s0*0  s0*0  s0*k0
  const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
  const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
  // Apply the row shift. Adding row_shift before an arithmetic shift by
  // row_shift equals the usual 1 << (row_shift - 1) rounding bias for the
  // shifts of 1 and 2 (and is a no-op for 0).
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i c = _mm_packs_epi32(b, b);
  StoreLo8(dst, c);

  return true;
}
1085 
Adst4DcOnlyColumn(void * dest,const void * source,int non_zero_coeff_count,int width)1086 LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
1087                                              int non_zero_coeff_count,
1088                                              int width) {
1089   if (non_zero_coeff_count > 1) {
1090     return false;
1091   }
1092 
1093   auto* dst = static_cast<int16_t*>(dest);
1094   const auto* const src = static_cast<const int16_t*>(source);
1095 
1096   int i = 0;
1097   do {
1098     const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&src[i]));
1099     const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
1100     const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
1101     const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
1102     const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
1103     const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
1104     const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
1105     const __m128i x0 = s0;
1106     const __m128i x1 = s1;
1107     const __m128i x2 = s2;
1108     const __m128i x3 = _mm_add_epi32(s0, s1);
1109     const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
1110     const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
1111     const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
1112     const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
1113     const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
1114     const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
1115     StoreLo8(&dst[i], dst_0_1);
1116     StoreHi8(&dst[i + width * 1], dst_0_1);
1117     StoreLo8(&dst[i + width * 2], dst_2_3);
1118     StoreHi8(&dst[i + width * 3], dst_2_3);
1119     i += 4;
1120   } while (i < width);
1121 
1122   return true;
1123 }
1124 
// 8-point ADST. bufferfly_rotation (sic) selects the rotation kernel
// (4- or 8-lane butterfly); stage_is_rectangular selects 8x4 tile handling
// versus 8x8. transpose controls transposition on load and store.
template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, const void* source,
                                        int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8], x[8];

  // Load the input tile, transposing on the way in when requested.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
    } else {
      LoadSrc<8, 8>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i input[8];
      LoadSrc<16, 8>(src, step, 0, input);
      Transpose8x8_U16(input, x);
    } else {
      LoadSrc<16, 8>(src, step, 0, x);
    }
  }

  // stage 1. Input permutation.
  s[0] = x[7];
  s[1] = x[0];
  s[2] = x[5];
  s[3] = x[2];
  s[4] = x[3];
  s[5] = x[4];
  s[6] = x[1];
  s[7] = x[6];

  // stage 2.
  bufferfly_rotation(&s[0], &s[1], 60 - 0, true);
  bufferfly_rotation(&s[2], &s[3], 60 - 16, true);
  bufferfly_rotation(&s[4], &s[5], 60 - 32, true);
  bufferfly_rotation(&s[6], &s[7], 60 - 48, true);

  // stage 3.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[3], &s[7], false);

  // stage 4.
  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);

  // stage 6.
  bufferfly_rotation(&s[2], &s[3], 32, true);
  bufferfly_rotation(&s[6], &s[7], 32, true);

  // stage 7. Output permutation; the negations are saturating subtractions
  // from zero.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  // Store, transposing on the way out when requested.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
    } else {
      StoreDst<8, 8>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      __m128i output[8];
      Transpose8x8_U16(x, output);
      StoreDst<16, 8>(dst, step, 0, output);
    } else {
      StoreDst<16, 8>(dst, step, 0, x);
    }
  }
}
1215 
// Fast path for an 8-point ADST row transform when at most the DC
// coefficient is non-zero. With a single non-zero input the butterfly
// network collapses to a few rotations on duplicated values. Returns false
// when the general transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
                                       int non_zero_coeff_count,
                                       bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[8];

  // Broadcast src[0] and optionally pre-scale by kTransformRowMultiplier.
  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
  // stage 1.
  s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);

  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

  // stage 3.
  s[4] = s[0];
  s[5] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[4], &s[5], 48, true);

  // stage 5.
  s[2] = s[0];
  s[3] = s[1];
  s[6] = s[4];
  s[7] = s[5];

  // stage 6.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);

  // stage 7. Output permutation with saturating negations.
  __m128i x[8];
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[4]);
  x[2] = s[6];
  x[3] = _mm_subs_epi16(v_zero, s[2]);
  x[4] = s[3];
  x[5] = _mm_subs_epi16(v_zero, s[7]);
  x[6] = s[5];
  x[7] = _mm_subs_epi16(v_zero, s[1]);

  // Gather the low word of each x[i] into two interleaved vectors so the
  // eight outputs can be widened, shifted and stored in one pass.
  const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
  const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
  const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
  const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
  const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
  const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);

  // Apply the row shift (adding row_shift acts as the rounding bias for
  // shifts of 1 and 2) and store the eight 16-bit results.
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
  const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
  StoreUnaligned16(dst, _mm_packs_epi32(b, b1));

  return true;
}
1285 
Adst8DcOnlyColumn(void * dest,const void * source,int non_zero_coeff_count,int width)1286 LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
1287                                              int non_zero_coeff_count,
1288                                              int width) {
1289   if (non_zero_coeff_count > 1) {
1290     return false;
1291   }
1292 
1293   auto* dst = static_cast<int16_t*>(dest);
1294   const auto* const src = static_cast<const int16_t*>(source);
1295   __m128i s[8];
1296 
1297   int i = 0;
1298   do {
1299     const __m128i v_src = LoadLo8(&src[i]);
1300     // stage 1.
1301     s[1] = v_src;
1302 
1303     // stage 2.
1304     ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
1305 
1306     // stage 3.
1307     s[4] = s[0];
1308     s[5] = s[1];
1309 
1310     // stage 4.
1311     ButterflyRotation_4(&s[4], &s[5], 48, true);
1312 
1313     // stage 5.
1314     s[2] = s[0];
1315     s[3] = s[1];
1316     s[6] = s[4];
1317     s[7] = s[5];
1318 
1319     // stage 6.
1320     ButterflyRotation_4(&s[2], &s[3], 32, true);
1321     ButterflyRotation_4(&s[6], &s[7], 32, true);
1322 
1323     // stage 7.
1324     __m128i x[8];
1325     const __m128i v_zero = _mm_setzero_si128();
1326     x[0] = s[0];
1327     x[1] = _mm_subs_epi16(v_zero, s[4]);
1328     x[2] = s[6];
1329     x[3] = _mm_subs_epi16(v_zero, s[2]);
1330     x[4] = s[3];
1331     x[5] = _mm_subs_epi16(v_zero, s[7]);
1332     x[6] = s[5];
1333     x[7] = _mm_subs_epi16(v_zero, s[1]);
1334 
1335     for (int j = 0; j < 8; ++j) {
1336       StoreLo8(&dst[j * width], x[j]);
1337     }
1338     i += 4;
1339     dst += 4;
1340   } while (i < width);
1341 
1342   return true;
1343 }
1344 
// 16-point ADST. bufferfly_rotation (sic) selects the rotation kernel;
// stage_is_rectangular selects 16x4 tile handling versus 16x8/16x16.
// transpose controls transposition on load and store.
template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, const void* source,
                                         int32_t step, bool transpose) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[16], x[16];

  // Load the input tile, transposing on the way in when requested.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i input[4];
      LoadSrc<16, 4>(src, step, 0, input);
      Transpose8x4To4x8_U16(input, x);
      LoadSrc<16, 4>(src, step, 8, input);
      Transpose8x4To4x8_U16(input, &x[8]);
    } else {
      LoadSrc<8, 16>(src, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i input[8];
        LoadSrc<16, 8>(src, step, idx, input);
        Transpose8x8_U16(input, &x[idx]);
      }
    } else {
      LoadSrc<16, 16>(src, step, 0, x);
    }
  }

  // stage 1. Input permutation.
  s[0] = x[15];
  s[1] = x[0];
  s[2] = x[13];
  s[3] = x[2];
  s[4] = x[11];
  s[5] = x[4];
  s[6] = x[9];
  s[7] = x[6];
  s[8] = x[7];
  s[9] = x[8];
  s[10] = x[5];
  s[11] = x[10];
  s[12] = x[3];
  s[13] = x[12];
  s[14] = x[1];
  s[15] = x[14];

  // stage 2.
  bufferfly_rotation(&s[0], &s[1], 62 - 0, true);
  bufferfly_rotation(&s[2], &s[3], 62 - 8, true);
  bufferfly_rotation(&s[4], &s[5], 62 - 16, true);
  bufferfly_rotation(&s[6], &s[7], 62 - 24, true);
  bufferfly_rotation(&s[8], &s[9], 62 - 32, true);
  bufferfly_rotation(&s[10], &s[11], 62 - 40, true);
  bufferfly_rotation(&s[12], &s[13], 62 - 48, true);
  bufferfly_rotation(&s[14], &s[15], 62 - 56, true);

  // stage 3.
  HadamardRotation(&s[0], &s[8], false);
  HadamardRotation(&s[1], &s[9], false);
  HadamardRotation(&s[2], &s[10], false);
  HadamardRotation(&s[3], &s[11], false);
  HadamardRotation(&s[4], &s[12], false);
  HadamardRotation(&s[5], &s[13], false);
  HadamardRotation(&s[6], &s[14], false);
  HadamardRotation(&s[7], &s[15], false);

  // stage 4.
  bufferfly_rotation(&s[8], &s[9], 56 - 0, true);
  bufferfly_rotation(&s[13], &s[12], 8 + 0, true);
  bufferfly_rotation(&s[10], &s[11], 56 - 32, true);
  bufferfly_rotation(&s[15], &s[14], 8 + 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[4], false);
  HadamardRotation(&s[8], &s[12], false);
  HadamardRotation(&s[1], &s[5], false);
  HadamardRotation(&s[9], &s[13], false);
  HadamardRotation(&s[2], &s[6], false);
  HadamardRotation(&s[10], &s[14], false);
  HadamardRotation(&s[3], &s[7], false);
  HadamardRotation(&s[11], &s[15], false);

  // stage 6.
  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
  bufferfly_rotation(&s[12], &s[13], 48 - 0, true);
  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
  bufferfly_rotation(&s[15], &s[14], 48 - 32, true);

  // stage 7.
  HadamardRotation(&s[0], &s[2], false);
  HadamardRotation(&s[4], &s[6], false);
  HadamardRotation(&s[8], &s[10], false);
  HadamardRotation(&s[12], &s[14], false);
  HadamardRotation(&s[1], &s[3], false);
  HadamardRotation(&s[5], &s[7], false);
  HadamardRotation(&s[9], &s[11], false);
  HadamardRotation(&s[13], &s[15], false);

  // stage 8.
  bufferfly_rotation(&s[2], &s[3], 32, true);
  bufferfly_rotation(&s[6], &s[7], 32, true);
  bufferfly_rotation(&s[10], &s[11], 32, true);
  bufferfly_rotation(&s[14], &s[15], 32, true);

  // stage 9. Output permutation; the negations are saturating subtractions
  // from zero.
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);

  // Store, transposing on the way out when requested.
  if (stage_is_rectangular) {
    if (transpose) {
      __m128i output[4];
      Transpose4x8To8x4_U16(x, output);
      StoreDst<16, 4>(dst, step, 0, output);
      Transpose4x8To8x4_U16(&x[8], output);
      StoreDst<16, 4>(dst, step, 8, output);
    } else {
      StoreDst<8, 16>(dst, step, 0, x);
    }
  } else {
    if (transpose) {
      for (int idx = 0; idx < 16; idx += 8) {
        __m128i output[8];
        Transpose8x8_U16(&x[idx], output);
        StoreDst<16, 8>(dst, step, idx, output);
      }
    } else {
      StoreDst<16, 16>(dst, step, 0, x);
    }
  }
}
1491 
// Shared tail of the DC-only 16-point ADST: the caller places the only
// non-zero value in s[1]; stages 2-9 expand it to the 16 outputs in x.
// With a single input, each Hadamard stage reduces to duplicating values,
// so only the butterfly rotations remain.
LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);

  // stage 3.
  s[8] = s[0];
  s[9] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[8], &s[9], 56, true);

  // stage 5.
  s[4] = s[0];
  s[12] = s[8];
  s[5] = s[1];
  s[13] = s[9];

  // stage 6.
  ButterflyRotation_4(&s[4], &s[5], 48, true);
  ButterflyRotation_4(&s[12], &s[13], 48, true);

  // stage 7.
  s[2] = s[0];
  s[6] = s[4];
  s[10] = s[8];
  s[14] = s[12];
  s[3] = s[1];
  s[7] = s[5];
  s[11] = s[9];
  s[15] = s[13];

  // stage 8.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);
  ButterflyRotation_4(&s[10], &s[11], 32, true);
  ButterflyRotation_4(&s[14], &s[15], 32, true);

  // stage 9. Output permutation with saturating negations (matches the
  // final stage of Adst16_SSE4_1).
  const __m128i v_zero = _mm_setzero_si128();
  x[0] = s[0];
  x[1] = _mm_subs_epi16(v_zero, s[8]);
  x[2] = s[12];
  x[3] = _mm_subs_epi16(v_zero, s[4]);
  x[4] = s[6];
  x[5] = _mm_subs_epi16(v_zero, s[14]);
  x[6] = s[10];
  x[7] = _mm_subs_epi16(v_zero, s[2]);
  x[8] = s[3];
  x[9] = _mm_subs_epi16(v_zero, s[11]);
  x[10] = s[15];
  x[11] = _mm_subs_epi16(v_zero, s[7]);
  x[12] = s[5];
  x[13] = _mm_subs_epi16(v_zero, s[13]);
  x[14] = s[9];
  x[15] = _mm_subs_epi16(v_zero, s[1]);
}
1548 
Adst16DcOnly(void * dest,const void * source,int non_zero_coeff_count,bool should_round,int row_shift)1549 LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
1550                                         int non_zero_coeff_count,
1551                                         bool should_round, int row_shift) {
1552   if (non_zero_coeff_count > 1) {
1553     return false;
1554   }
1555 
1556   auto* dst = static_cast<int16_t*>(dest);
1557   const auto* const src = static_cast<const int16_t*>(source);
1558   __m128i s[16];
1559   __m128i x[16];
1560 
1561   const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
1562   const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
1563   const __m128i v_kTransformRowMultiplier =
1564       _mm_set1_epi16(kTransformRowMultiplier << 3);
1565   const __m128i v_src_round =
1566       _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
1567   // stage 1.
1568   s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
1569 
1570   Adst16DcOnlyInternal(s, x);
1571 
1572   for (int i = 0; i < 2; ++i) {
1573     const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
1574     const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
1575     const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
1576     const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
1577     const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
1578     const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
1579 
1580     const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
1581     const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
1582     const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
1583     const __m128i a1 =
1584         _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
1585     const __m128i b = _mm_sra_epi32(a, v_row_shift);
1586     const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
1587     StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
1588   }
1589   return true;
1590 }
1591 
Adst16DcOnlyColumn(void * dest,const void * source,int non_zero_coeff_count,int width)1592 LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
1593                                               int non_zero_coeff_count,
1594                                               int width) {
1595   if (non_zero_coeff_count > 1) {
1596     return false;
1597   }
1598 
1599   auto* dst = static_cast<int16_t*>(dest);
1600   const auto* const src = static_cast<const int16_t*>(source);
1601 
1602   int i = 0;
1603   do {
1604     __m128i s[16];
1605     __m128i x[16];
1606     const __m128i v_src = LoadUnaligned16(&src[i]);
1607     // stage 1.
1608     s[1] = v_src;
1609 
1610     Adst16DcOnlyInternal(s, x);
1611 
1612     for (int j = 0; j < 16; ++j) {
1613       StoreLo8(&dst[j * width], x[j]);
1614     }
1615     i += 4;
1616     dst += 4;
1617   } while (i < width);
1618 
1619   return true;
1620 }
1621 
1622 //------------------------------------------------------------------------------
1623 // Identity Transforms.
1624 
1625 template <bool is_row_shift>
Identity4_SSE4_1(void * dest,const void * source,int32_t step)1626 LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, const void* source,
1627                                             int32_t step) {
1628   auto* const dst = static_cast<int16_t*>(dest);
1629   const auto* const src = static_cast<const int16_t*>(source);
1630 
1631   if (is_row_shift) {
1632     const int shift = 1;
1633     const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1634     const __m128i v_multiplier_one =
1635         _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
1636     for (int i = 0; i < 4; i += 2) {
1637       const __m128i v_src = LoadUnaligned16(&src[i * step]);
1638       const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
1639       const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
1640       const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
1641       const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
1642       const __m128i b = _mm_srai_epi32(a, 12 + shift);
1643       const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
1644       StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
1645     }
1646   } else {
1647     const __m128i v_multiplier =
1648         _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
1649     for (int i = 0; i < 4; i += 2) {
1650       const __m128i v_src = LoadUnaligned16(&src[i * step]);
1651       const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
1652       const __m128i b = _mm_adds_epi16(a, v_src);
1653       StoreUnaligned16(&dst[i * step], b);
1654     }
1655   }
1656 }
1657 
// Fast path for a 4-point identity row transform when at most the DC
// coefficient is non-zero; writes the single scaled value to dst[0].
// Returns false when the general transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           bool should_round, int tx_height) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  // Optionally pre-scale src[0] by kTransformRowMultiplier (Q15 via
  // mulhrs); blendv selects the scaled value only when should_round is set.
  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);

  // Taller transforms use one extra bit of row shift.
  const int shift = (tx_height < 16) ? 0 : 1;
  // Interleaving the rounding bias with the source lets one madd compute
  // bias * 1 + src * kIdentity4Multiplier in the low 32-bit lane.
  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
  const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
  const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
  const __m128i b = _mm_srai_epi32(a, 12 + shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}
1686 
// Applies the identity4 column transform to |source| and adds the result,
// rounded and shifted down by 4 bits, to the pixels in |frame|.
LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;

  const __m128i v_multiplier_fraction =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
  const __m128i v_eight = _mm_set1_epi16(8);

  if (tx_width == 4) {
    // One 4-value row per iteration.
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      // Identity4 scale: v_dst_i = src + mulhrs(src, fraction).
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
      const __m128i frame_data = Load4(dst);
      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
      // (v_dst_i + 8) >> 4, added to the widened frame pixels, then packed
      // back to 8 bits with unsigned saturation.
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    // Wider rows: 8 values per inner iteration.
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        // Same identity4 scale as the 4-wide path.
        const __m128i v_src_mult =
            _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1733 
// Applies both the identity4 row and column scales to |source| in a single
// pass and adds the result, rounded and shifted down by 4 bits, to |frame|.
LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;

  const __m128i v_multiplier_fraction =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
  const __m128i v_eight = _mm_set1_epi16(8);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);

  if (tx_width == 4) {
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      // Row pass: v_dst_row = src + mulhrs(src, fraction).
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
      const __m128i frame_data = Load4(dst);
      const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
      // Column pass: the same identity4 scale applied to the row result.
      const __m128i v_src_mult2 =
          _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
      const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
      const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
      // (v_dst_col + 8) >> 4, added to the frame pixels and packed back to
      // 8 bits with unsigned saturation.
      const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_adds_epi16(frame_data16, b);
      Store4(dst, _mm_packus_epi16(c, c));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        // Row pass for the wide case: pre-round the input, then double it.
        const __m128i v_src_round =
            _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
        const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
        // Column pass: identity4 scale on the row result.
        const __m128i v_src_mult2 =
            _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
        const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
        const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_adds_epi16(frame_data16, b);
        StoreLo8(dst + j, _mm_packus_epi16(c, c));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1788 
Identity8Row32_SSE4_1(void * dest,const void * source,int32_t step)1789 LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, const void* source,
1790                                                  int32_t step) {
1791   auto* const dst = static_cast<int16_t*>(dest);
1792   const auto* const src = static_cast<const int16_t*>(source);
1793 
1794   // When combining the identity8 multiplier with the row shift, the
1795   // calculations for tx_height equal to 32 can be simplified from
1796   // ((A * 2) + 2) >> 2) to ((A + 1) >> 1).
1797   const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
1798   for (int h = 0; h < 4; ++h) {
1799     const __m128i v_src = LoadUnaligned16(&src[h * step]);
1800     const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
1801     StoreUnaligned16(&dst[h * step], v_src_mult);
1802   }
1803 }
1804 
Identity8Row4_SSE4_1(void * dest,const void * source,int32_t step)1805 LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, const void* source,
1806                                                 int32_t step) {
1807   auto* const dst = static_cast<int16_t*>(dest);
1808   const auto* const src = static_cast<const int16_t*>(source);
1809 
1810   for (int h = 0; h < 4; ++h) {
1811     const __m128i v_src = LoadUnaligned16(&src[h * step]);
1812     // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
1813     // saturating add here is ok.
1814     const __m128i a = _mm_adds_epi16(v_src, v_src);
1815     StoreUnaligned16(&dst[h * step], a);
1816   }
1817 }
1818 
// Fast path for an identity8 row transform when at most the DC coefficient is
// nonzero: computes dst[0] and returns true. Returns false when the full
// transform must run.
LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
                                           int non_zero_coeff_count,
                                           bool should_round, int row_shift) {
  if (non_zero_coeff_count > 1) {
    return false;
  }

  auto* dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
  // Branchless optional pre-rounding: blend in the rounded coefficient only
  // when |should_round| is set, then widen to 32 bits.
  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
  const __m128i v_kTransformRowMultiplier =
      _mm_set1_epi16(kTransformRowMultiplier << 3);
  const __m128i v_src_round =
      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
  const __m128i v_src =
      _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
  // Identity8 multiplier: double the value, then apply the row shift.
  const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
  // NOTE(review): |row_shift| is used as both the rounding addend and the
  // shift count. That is only equivalent to ((1 << shift) >> 1) rounding for
  // shifts of 0-2 -- presumably the only values kTransformRowShift produces;
  // verify against the callers.
  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
  const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
  const __m128i b = _mm_sra_epi32(a, v_row_shift);
  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
  return true;
}
1845 
// Applies the identity8 column transform (a doubling) to |source| and adds
// the result, rounded and shifted down by 4 bits, to the pixels in |frame|.
LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  const __m128i v_eight = _mm_set1_epi16(8);
  if (tx_width == 4) {
    // One 4-value row per iteration.
    int i = 0;
    do {
      const int row = i * tx_width;
      const __m128i v_src = LoadLo8(&source[row]);
      // Identity8 scale: double the residual (saturating).
      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
      const __m128i frame_data = Load4(dst);
      // (v_dst_i + 8) >> 4, added to the widened frame pixels, packed back
      // with unsigned saturation.
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    // Wider rows: 8 values per inner iteration.
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1886 
// Identity16 row transform: scales each value by kIdentity16Multiplier in
// 12-bit fixed point and applies the row shift. Processes 4 rows of 16
// values (two registers per row).
LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, const void* source,
                                                int32_t step, int shift) {
  auto* const dst = static_cast<int16_t*>(dest);
  const auto* const src = static_cast<const int16_t*>(source);

  // Each 32-bit madd lane computes rounding * 1 + src * kIdentity16Multiplier:
  // the rounding constant is interleaved into the low word of every pair.
  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
  const __m128i v_multiplier_one =
      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);

  for (int h = 0; h < 4; ++h) {
    const __m128i v_src = LoadUnaligned16(&src[h * step]);
    const __m128i v_src2 = LoadUnaligned16(&src[h * step + 8]);
    // Interleave the rounding term with the source words so madd can apply
    // multiplier and rounding in one step.
    const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
    const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
    const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
    const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
    const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
    const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
    const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
    const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
    // Shift down by 12 + shift and pack back to 16 bits with saturation.
    const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
    const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
    const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
    const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
    StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
    StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
  }
}
1916 
Identity16DcOnly(void * dest,const void * source,int non_zero_coeff_count,bool should_round,int shift)1917 LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
1918                                             int non_zero_coeff_count,
1919                                             bool should_round, int shift) {
1920   if (non_zero_coeff_count > 1) {
1921     return false;
1922   }
1923 
1924   auto* dst = static_cast<int16_t*>(dest);
1925   const auto* const src = static_cast<const int16_t*>(source);
1926 
1927   const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
1928   const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
1929   const __m128i v_kTransformRowMultiplier =
1930       _mm_set1_epi16(kTransformRowMultiplier << 3);
1931   const __m128i v_src_round0 =
1932       _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
1933   const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
1934   const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
1935   const __m128i v_multiplier_one =
1936       _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
1937   const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
1938   const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
1939   const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
1940   const __m128i b = _mm_sra_epi32(a, v_shift);
1941   dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
1942   return true;
1943 }
1944 
// Applies the identity16 column transform to |source| and adds the result,
// rounded and shifted down by 4 bits, to the pixels in |frame|.
LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source) {
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  const __m128i v_eight = _mm_set1_epi16(8);
  // Identity16 scale computed as 2 * src + mulhrs(src, fraction); note the
  // fraction constant is doubled (<< 4) relative to the identity4 paths.
  const __m128i v_multiplier =
      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));

  if (tx_width == 4) {
    // One 4-value row per iteration.
    int i = 0;
    do {
      const __m128i v_src = LoadLo8(&source[i * tx_width]);
      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
      const __m128i frame_data = Load4(dst);
      const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
      // (v_dst_i + 8) >> 4, added to the widened frame pixels, packed back
      // with unsigned saturation.
      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    } while (++i < tx_height);
  } else {
    // Wider rows: 8 values per inner iteration.
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        const __m128i v_src = LoadUnaligned16(&source[row + j]);
        const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
        const __m128i frame_data = LoadLo8(dst + j);
        const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
        const __m128i b = _mm_srai_epi16(a, 4);
        const __m128i c = _mm_cvtepu8_epi16(frame_data);
        const __m128i d = _mm_adds_epi16(c, b);
        StoreLo8(dst + j, _mm_packus_epi16(d, d));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1991 
Identity32Row16_SSE4_1(void * dest,const void * source,const int32_t step)1992 LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
1993                                                   const void* source,
1994                                                   const int32_t step) {
1995   auto* const dst = static_cast<int16_t*>(dest);
1996   const auto* const src = static_cast<const int16_t*>(source);
1997 
1998   // When combining the identity32 multiplier with the row shift, the
1999   // calculation for tx_height equal to 16 can be simplified from
2000   // ((A * 4) + 1) >> 1) to (A * 2).
2001   for (int h = 0; h < 4; ++h) {
2002     for (int i = 0; i < 32; i += 8) {
2003       const __m128i v_src = LoadUnaligned16(&src[h * step + i]);
2004       // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
2005       // saturating add here is ok.
2006       const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
2007       StoreUnaligned16(&dst[h * step + i], v_dst_i);
2008     }
2009   }
2010 }
2011 
Identity32DcOnly(void * dest,const void * source,int non_zero_coeff_count)2012 LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
2013                                             int non_zero_coeff_count) {
2014   if (non_zero_coeff_count > 1) {
2015     return false;
2016   }
2017 
2018   auto* dst = static_cast<int16_t*>(dest);
2019   const auto* const src = static_cast<const int16_t*>(source);
2020 
2021   const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
2022   const __m128i v_kTransformRowMultiplier =
2023       _mm_set1_epi16(kTransformRowMultiplier << 3);
2024   const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
2025 
2026   // When combining the identity32 multiplier with the row shift, the
2027   // calculation for tx_height equal to 16 can be simplified from
2028   // ((A * 4) + 1) >> 1) to (A * 2).
2029   const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
2030   dst[0] = _mm_extract_epi16(v_dst_0, 0);
2031   return true;
2032 }
2033 
Identity32ColumnStoreToFrame(Array2DView<uint8_t> frame,const int start_x,const int start_y,const int tx_width,const int tx_height,const int16_t * source)2034 LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
2035     Array2DView<uint8_t> frame, const int start_x, const int start_y,
2036     const int tx_width, const int tx_height, const int16_t* source) {
2037   const int stride = frame.columns();
2038   uint8_t* dst = frame[start_y] + start_x;
2039   const __m128i v_two = _mm_set1_epi16(2);
2040 
2041   int i = 0;
2042   do {
2043     const int row = i * tx_width;
2044     int j = 0;
2045     do {
2046       const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
2047       const __m128i frame_data = LoadLo8(dst + j);
2048       const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
2049       const __m128i b = _mm_srai_epi16(a, 2);
2050       const __m128i c = _mm_cvtepu8_epi16(frame_data);
2051       const __m128i d = _mm_adds_epi16(c, b);
2052       StoreLo8(dst + j, _mm_packus_epi16(d, d));
2053       j += 8;
2054     } while (j < tx_width);
2055     dst += stride;
2056   } while (++i < tx_height);
2057 }
2058 
2059 //------------------------------------------------------------------------------
2060 // Walsh Hadamard Transform.
2061 
2062 // Process 4 wht4 rows and columns.
// Applies the 4x4 Walsh-Hadamard transform (rows then columns) to |source|
// and adds the result directly to the pixels in |frame|.
LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
                                       const int start_x, const int start_y,
                                       const void* source,
                                       const int non_zero_coeff_count) {
  const auto* const src = static_cast<const int16_t*>(source);
  __m128i s[4], x[4];

  if (non_zero_coeff_count == 1) {
    // Special case: only src[0] is nonzero.
    //   src[0]  0   0   0
    //       0   0   0   0
    //       0   0   0   0
    //       0   0   0   0
    //
    // After the row and column transforms are applied, we have:
    //       f   h   h   h
    //       g   i   i   i
    //       g   i   i   i
    //       g   i   i   i
    // where f, g, h, i are computed as follows.
    int16_t f = (src[0] >> 2) - (src[0] >> 3);
    const int16_t g = f >> 1;
    f = f - (f >> 1);
    const int16_t h = (src[0] >> 3) - (src[0] >> 4);
    const int16_t i = (src[0] >> 4);
    // Build row 0 as [f h h h] and rows 1-3 as [g i i i].
    s[0] = _mm_set1_epi16(h);
    s[0] = _mm_insert_epi16(s[0], f, 0);
    s[1] = _mm_set1_epi16(i);
    s[1] = _mm_insert_epi16(s[1], g, 0);
    s[2] = s[3] = s[1];
  } else {
    // Rows are loaded into x[] in butterfly operand order (0, 2, 3, 1).
    x[0] = LoadLo8(&src[0 * 4]);
    x[2] = LoadLo8(&src[1 * 4]);
    x[3] = LoadLo8(&src[2 * 4]);
    x[1] = LoadLo8(&src[3 * 4]);

    // Row transforms. Inputs are pre-shifted down by 2, then the WHT
    // lifting steps are applied in place.
    Transpose4x4_U16(x, x);
    s[0] = _mm_srai_epi16(x[0], 2);
    s[2] = _mm_srai_epi16(x[1], 2);
    s[3] = _mm_srai_epi16(x[2], 2);
    s[1] = _mm_srai_epi16(x[3], 2);
    s[0] = _mm_add_epi16(s[0], s[2]);
    s[3] = _mm_sub_epi16(s[3], s[1]);
    __m128i e = _mm_sub_epi16(s[0], s[3]);
    e = _mm_srai_epi16(e, 1);
    s[1] = _mm_sub_epi16(e, s[1]);
    s[2] = _mm_sub_epi16(e, s[2]);
    s[0] = _mm_sub_epi16(s[0], s[1]);
    s[3] = _mm_add_epi16(s[3], s[2]);
    Transpose4x4_U16(s, s);

    // Column transforms: the same lifting steps applied to the transposed
    // row results.
    s[0] = _mm_add_epi16(s[0], s[2]);
    s[3] = _mm_sub_epi16(s[3], s[1]);
    e = _mm_sub_epi16(s[0], s[3]);
    e = _mm_srai_epi16(e, 1);
    s[1] = _mm_sub_epi16(e, s[1]);
    s[2] = _mm_sub_epi16(e, s[2]);
    s[0] = _mm_sub_epi16(s[0], s[1]);
    s[3] = _mm_add_epi16(s[3], s[2]);
  }

  // Store to frame: widen each pixel row, add the residual, pack back to
  // 8 bits with unsigned saturation.
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  for (int row = 0; row < 4; ++row) {
    const __m128i frame_data = Load4(dst);
    const __m128i a = _mm_cvtepu8_epi16(frame_data);
    // Saturate to prevent overflowing int16_t
    const __m128i b = _mm_adds_epi16(a, s[row]);
    Store4(dst, _mm_packus_epi16(b, b));
    dst += stride;
  }
}
2138 
2139 //------------------------------------------------------------------------------
2140 // row/column transform loops
2141 
// Adds the residual in |source| to the pixels in |frame|:
// frame = Clip(frame + ((residual + 8) >> 4)). When |enable_flip_rows| is
// set and |tx_type| is in kTransformFlipRowsMask, source rows are read
// bottom-up.
template <bool enable_flip_rows = false>
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
    Array2DView<uint8_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int16_t* source,
    TransformType tx_type) {
  const bool flip_rows =
      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
  const __m128i v_eight = _mm_set1_epi16(8);
  const int stride = frame.columns();
  uint8_t* dst = frame[start_y] + start_x;
  if (tx_width == 4) {
    // 4 pixels per row.
    for (int i = 0; i < tx_height; ++i) {
      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
      const __m128i residual = LoadLo8(&source[row]);
      const __m128i frame_data = Load4(dst);
      // Saturate to prevent overflowing int16_t
      const __m128i a = _mm_adds_epi16(residual, v_eight);
      const __m128i b = _mm_srai_epi16(a, 4);
      const __m128i c = _mm_cvtepu8_epi16(frame_data);
      const __m128i d = _mm_adds_epi16(c, b);
      Store4(dst, _mm_packus_epi16(d, d));
      dst += stride;
    }
  } else if (tx_width == 8) {
    // 8 pixels per row, one register per row.
    for (int i = 0; i < tx_height; ++i) {
      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
      const __m128i residual = LoadUnaligned16(&source[row]);
      const __m128i frame_data = LoadLo8(dst);
      // Saturate to prevent overflowing int16_t
      const __m128i b = _mm_adds_epi16(residual, v_eight);
      const __m128i c = _mm_srai_epi16(b, 4);
      const __m128i d = _mm_cvtepu8_epi16(frame_data);
      const __m128i e = _mm_adds_epi16(d, c);
      StoreLo8(dst, _mm_packus_epi16(e, e));
      dst += stride;
    }
  } else {
    // 16 or more pixels per row, processed 16 at a time.
    for (int i = 0; i < tx_height; ++i) {
      const int y = start_y + i;
      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
      int j = 0;
      do {
        const int x = start_x + j;
        const __m128i residual = LoadUnaligned16(&source[row + j]);
        const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
        const __m128i frame_data = LoadUnaligned16(frame[y] + x);
        const __m128i b = _mm_adds_epi16(residual, v_eight);
        const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
        const __m128i c = _mm_srai_epi16(b, 4);
        const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
        // Widen the low and high halves of the 16 frame pixels separately.
        const __m128i d = _mm_cvtepu8_epi16(frame_data);
        const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
        const __m128i e = _mm_adds_epi16(d, c);
        const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
        StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
        j += 16;
      } while (j < tx_width);
    }
  }
}
2202 
// Reverses the values within every row of |source| in place, used for
// transforms with flipped columns. |tx_height| is the compile-time height;
// |tx_width| selects between the 4/8/16+ wide layouts.
template <int tx_height>
LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
  // Shuffle mask that reverses the order of the eight 16-bit words in a
  // register.
  const __m128i word_reverse_8 =
      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
  if (tx_width >= 16) {
    // Reverse each 16-value row by reversing both 8-word halves and swapping
    // them. (Rows wider than 16 are handled 16 values at a time, reversing
    // each 16-value group.)
    int i = 0;
    do {
      // read 16 shorts
      const __m128i v3210 = LoadUnaligned16(&source[i]);
      const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
      const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
      const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
      StoreUnaligned16(&source[i], v4567);
      StoreUnaligned16(&source[i + 8], v0123);
      i += 16;
    } while (i < tx_width * tx_height);
  } else if (tx_width == 8) {
    // One 8-value row per register.
    for (int i = 0; i < 8 * tx_height; i += 8) {
      const __m128i a = LoadUnaligned16(&source[i]);
      const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
      StoreUnaligned16(&source[i], b);
    }
  } else {
    // Shuffle mask that reverses each 4-word half of a register
    // independently, so two 4-value rows can be flipped at once.
    const __m128i dual_word_reverse_4 =
        _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
    // Process two rows per iteration.
    for (int i = 0; i < 4 * tx_height; i += 8) {
      const __m128i a = LoadUnaligned16(&source[i]);
      const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
      StoreUnaligned16(&source[i], b);
    }
  }
}
2236 
2237 template <int tx_width>
ApplyRounding(int16_t * source,int num_rows)2238 LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
2239   const __m128i v_kTransformRowMultiplier =
2240       _mm_set1_epi16(kTransformRowMultiplier << 3);
2241   if (tx_width == 4) {
2242     // Process two rows per iteration.
2243     int i = 0;
2244     do {
2245       const __m128i a = LoadUnaligned16(&source[i]);
2246       const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2247       StoreUnaligned16(&source[i], b);
2248       i += 8;
2249     } while (i < tx_width * num_rows);
2250   } else {
2251     int i = 0;
2252     do {
2253       // The last 32 values of every row are always zero if the |tx_width| is
2254       // 64.
2255       const int non_zero_width = (tx_width < 64) ? tx_width : 32;
2256       int j = 0;
2257       do {
2258         const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
2259         const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
2260         StoreUnaligned16(&source[i * tx_width + j], b);
2261         j += 8;
2262       } while (j < non_zero_width);
2263     } while (++i < num_rows);
2264   }
2265 }
2266 
2267 template <int tx_width>
RowShift(int16_t * source,int num_rows,int row_shift)2268 LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
2269                                     int row_shift) {
2270   const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
2271   const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
2272   if (tx_width == 4) {
2273     // Process two rows per iteration.
2274     int i = 0;
2275     do {
2276       const __m128i residual = LoadUnaligned16(&source[i]);
2277       const __m128i shifted_residual =
2278           ShiftResidual(residual, v_row_shift_add, v_row_shift);
2279       StoreUnaligned16(&source[i], shifted_residual);
2280       i += 8;
2281     } while (i < tx_width * num_rows);
2282   } else {
2283     int i = 0;
2284     do {
2285       for (int j = 0; j < tx_width; j += 8) {
2286         const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
2287         const __m128i shifted_residual =
2288             ShiftResidual(residual, v_row_shift_add, v_row_shift);
2289         StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
2290       }
2291     } while (++i < num_rows);
2292   }
2293 }
2294 
// Transform-loop entry point for the 4-point DCT. Runs the row pass when
// |is_row| is true (pre-round, transform, row-shift) and otherwise the
// column pass followed by the store to |dst_frame|.
void Dct4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
                              void* src_buffer, int start_x, int start_y,
                              void* dst_frame, bool is_row,
                              int non_zero_coeff_count) {
  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
  auto* src = static_cast<int16_t*>(src_buffer);
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];

  if (is_row) {
    // For a 4-wide transform, pre-rounding applies only at height 8 and the
    // row shift only at height 16.
    const bool should_round = (tx_height == 8);
    const int row_shift = static_cast<int>(tx_height == 16);

    // DC-only shortcut: if it handled everything, the row pass is done.
    if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
                     row_shift)) {
      return;
    }

    const int num_rows =
        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
    if (should_round) {
      ApplyRounding<4>(src, num_rows);
    }

    if (num_rows <= 4) {
      // Process 4 1d dct4 rows in parallel.
      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
                                              /*transpose=*/true);
    } else {
      // Process 8 1d dct4 rows in parallel per iteration.
      int i = 0;
      do {
        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
                                               /*step=*/4, /*transpose=*/true);
        i += 8;
      } while (i < num_rows);
    }
    if (tx_height == 16) {
      RowShift<4>(src, num_rows, 1);
    }
    return;
  }

  assert(!is_row);
  // Column pass: optionally flip the columns, transform, then write to the
  // frame with the final rounding shift.
  if (kTransformFlipColumnsMask.Contains(tx_type)) {
    FlipColumns<4>(src, tx_width);
  }

  if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
    if (tx_width == 4) {
      // Process 4 1d dct4 columns in parallel.
      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
                                              /*transpose=*/false);
    } else {
      // Process 8 1d dct4 columns in parallel per iteration.
      int i = 0;
      do {
        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
                                               /*transpose=*/false);
        i += 8;
      } while (i < tx_width);
    }
  }
  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
}
2360 
Dct8TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2361 void Dct8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2362                               void* src_buffer, int start_x, int start_y,
2363                               void* dst_frame, bool is_row,
2364                               int non_zero_coeff_count) {
2365   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2366   auto* src = static_cast<int16_t*>(src_buffer);
2367   const int tx_width = kTransformWidth[tx_size];
2368   const int tx_height = kTransformHeight[tx_size];
2369 
2370   if (is_row) {
2371     const bool should_round = kShouldRound[tx_size];
2372     const uint8_t row_shift = kTransformRowShift[tx_size];
2373 
2374     if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
2375                      row_shift)) {
2376       return;
2377     }
2378 
2379     const int num_rows =
2380         GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
2381     if (should_round) {
2382       ApplyRounding<8>(src, num_rows);
2383     }
2384 
2385     if (num_rows <= 4) {
2386       // Process 4 1d dct8 rows in parallel.
2387       Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
2388                                              /*transpose=*/true);
2389     } else {
2390       // Process 8 1d dct8 rows in parallel per iteration.
2391       int i = 0;
2392       do {
2393         Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
2394                                                 /*step=*/8, /*transpose=*/true);
2395         i += 8;
2396       } while (i < num_rows);
2397     }
2398     if (row_shift > 0) {
2399       RowShift<8>(src, num_rows, row_shift);
2400     }
2401     return;
2402   }
2403 
2404   assert(!is_row);
2405   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2406     FlipColumns<8>(src, tx_width);
2407   }
2408 
2409   if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2410     if (tx_width == 4) {
2411       // Process 4 1d dct8 columns in parallel.
2412       Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
2413                                              /*transpose=*/false);
2414     } else {
2415       // Process 8 1d dct8 columns in parallel per iteration.
2416       int i = 0;
2417       do {
2418         Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
2419                                                 /*transpose=*/false);
2420         i += 8;
2421       } while (i < tx_width);
2422     }
2423   }
2424   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
2425 }
2426 
Dct16TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2427 void Dct16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2428                                void* src_buffer, int start_x, int start_y,
2429                                void* dst_frame, bool is_row,
2430                                int non_zero_coeff_count) {
2431   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2432   auto* src = static_cast<int16_t*>(src_buffer);
2433   const int tx_width = kTransformWidth[tx_size];
2434   const int tx_height = kTransformHeight[tx_size];
2435 
2436   if (is_row) {
2437     const bool should_round = kShouldRound[tx_size];
2438     const uint8_t row_shift = kTransformRowShift[tx_size];
2439 
2440     if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
2441                       row_shift)) {
2442       return;
2443     }
2444 
2445     const int num_rows =
2446         GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
2447     if (should_round) {
2448       ApplyRounding<16>(src, num_rows);
2449     }
2450 
2451     if (num_rows <= 4) {
2452       // Process 4 1d dct16 rows in parallel.
2453       Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
2454                                               /*transpose=*/true);
2455     } else {
2456       int i = 0;
2457       do {
2458         // Process 8 1d dct16 rows in parallel per iteration.
2459         Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
2460                                                  /*transpose=*/true);
2461         i += 8;
2462       } while (i < num_rows);
2463     }
2464     // row_shift is always non zero here.
2465     RowShift<16>(src, num_rows, row_shift);
2466 
2467     return;
2468   }
2469 
2470   assert(!is_row);
2471   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2472     FlipColumns<16>(src, tx_width);
2473   }
2474 
2475   if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2476     if (tx_width == 4) {
2477       // Process 4 1d dct16 columns in parallel.
2478       Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
2479                                               /*transpose=*/false);
2480     } else {
2481       int i = 0;
2482       do {
2483         // Process 8 1d dct16 columns in parallel per iteration.
2484         Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
2485                                                  /*transpose=*/false);
2486         i += 8;
2487       } while (i < tx_width);
2488     }
2489   }
2490   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
2491 }
2492 
Dct32TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2493 void Dct32TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2494                                void* src_buffer, int start_x, int start_y,
2495                                void* dst_frame, bool is_row,
2496                                int non_zero_coeff_count) {
2497   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2498   auto* src = static_cast<int16_t*>(src_buffer);
2499   const int tx_width = kTransformWidth[tx_size];
2500   const int tx_height = kTransformHeight[tx_size];
2501 
2502   if (is_row) {
2503     const bool should_round = kShouldRound[tx_size];
2504     const uint8_t row_shift = kTransformRowShift[tx_size];
2505 
2506     if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
2507                       row_shift)) {
2508       return;
2509     }
2510 
2511     const int num_rows =
2512         GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
2513     if (should_round) {
2514       ApplyRounding<32>(src, num_rows);
2515     }
2516     // Process 8 1d dct32 rows in parallel per iteration.
2517     int i = 0;
2518     do {
2519       Dct32_SSE4_1(&src[i * 32], &src[i * 32], 32, /*transpose=*/true);
2520       i += 8;
2521     } while (i < num_rows);
2522     // row_shift is always non zero here.
2523     RowShift<32>(src, num_rows, row_shift);
2524 
2525     return;
2526   }
2527 
2528   assert(!is_row);
2529   if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2530     // Process 8 1d dct32 columns in parallel per iteration.
2531     int i = 0;
2532     do {
2533       Dct32_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
2534       i += 8;
2535     } while (i < tx_width);
2536   }
2537   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
2538 }
2539 
Dct64TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2540 void Dct64TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2541                                void* src_buffer, int start_x, int start_y,
2542                                void* dst_frame, bool is_row,
2543                                int non_zero_coeff_count) {
2544   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2545   auto* src = static_cast<int16_t*>(src_buffer);
2546   const int tx_width = kTransformWidth[tx_size];
2547   const int tx_height = kTransformHeight[tx_size];
2548 
2549   if (is_row) {
2550     const bool should_round = kShouldRound[tx_size];
2551     const uint8_t row_shift = kTransformRowShift[tx_size];
2552 
2553     if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
2554                       row_shift)) {
2555       return;
2556     }
2557 
2558     const int num_rows =
2559         GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
2560     if (should_round) {
2561       ApplyRounding<64>(src, num_rows);
2562     }
2563     // Process 8 1d dct64 rows in parallel per iteration.
2564     int i = 0;
2565     do {
2566       Dct64_SSE4_1(&src[i * 64], &src[i * 64], 64, /*transpose=*/true);
2567       i += 8;
2568     } while (i < num_rows);
2569     // row_shift is always non zero here.
2570     RowShift<64>(src, num_rows, row_shift);
2571 
2572     return;
2573   }
2574 
2575   assert(!is_row);
2576   if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2577     // Process 8 1d dct64 columns in parallel per iteration.
2578     int i = 0;
2579     do {
2580       Dct64_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
2581       i += 8;
2582     } while (i < tx_width);
2583   }
2584   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
2585 }
2586 
Adst4TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2587 void Adst4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2588                                void* src_buffer, int start_x, int start_y,
2589                                void* dst_frame, bool is_row,
2590                                int non_zero_coeff_count) {
2591   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2592   auto* src = static_cast<int16_t*>(src_buffer);
2593   const int tx_width = kTransformWidth[tx_size];
2594   const int tx_height = kTransformHeight[tx_size];
2595 
2596   if (is_row) {
2597     const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
2598     const bool should_round = (tx_height == 8);
2599 
2600     if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2601                     row_shift)) {
2602       return;
2603     }
2604 
2605     const int num_rows =
2606         GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
2607     if (should_round) {
2608       ApplyRounding<4>(src, num_rows);
2609     }
2610 
2611     // Process 4 1d adst4 rows in parallel per iteration.
2612     int i = 0;
2613     do {
2614       Adst4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4,
2615                           /*transpose=*/true);
2616       i += 4;
2617     } while (i < num_rows);
2618 
2619     if (row_shift != 0u) {
2620       RowShift<4>(src, num_rows, 1);
2621     }
2622     return;
2623   }
2624 
2625   assert(!is_row);
2626   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2627     FlipColumns<4>(src, tx_width);
2628   }
2629 
2630   if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2631     // Process 4 1d adst4 columns in parallel per iteration.
2632     int i = 0;
2633     do {
2634       Adst4_SSE4_1<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
2635       i += 4;
2636     } while (i < tx_width);
2637   }
2638   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2639                                                    tx_width, 4, src, tx_type);
2640 }
2641 
Adst8TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2642 void Adst8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2643                                void* src_buffer, int start_x, int start_y,
2644                                void* dst_frame, bool is_row,
2645                                int non_zero_coeff_count) {
2646   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2647   auto* src = static_cast<int16_t*>(src_buffer);
2648   const int tx_width = kTransformWidth[tx_size];
2649   const int tx_height = kTransformHeight[tx_size];
2650 
2651   if (is_row) {
2652     const bool should_round = kShouldRound[tx_size];
2653     const uint8_t row_shift = kTransformRowShift[tx_size];
2654 
2655     if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2656                     row_shift)) {
2657       return;
2658     }
2659 
2660     const int num_rows =
2661         GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
2662     if (should_round) {
2663       ApplyRounding<8>(src, num_rows);
2664     }
2665 
2666     if (num_rows <= 4) {
2667       // Process 4 1d adst8 rows in parallel.
2668       Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
2669                                               /*transpose=*/true);
2670     } else {
2671       // Process 8 1d adst8 rows in parallel per iteration.
2672       int i = 0;
2673       do {
2674         Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
2675                                                  /*step=*/8,
2676                                                  /*transpose=*/true);
2677         i += 8;
2678       } while (i < num_rows);
2679     }
2680     if (row_shift > 0) {
2681       RowShift<8>(src, num_rows, row_shift);
2682     }
2683     return;
2684   }
2685 
2686   assert(!is_row);
2687   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2688     FlipColumns<8>(src, tx_width);
2689   }
2690 
2691   if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2692     if (tx_width == 4) {
2693       // Process 4 1d adst8 columns in parallel.
2694       Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
2695                                               /*transpose=*/false);
2696     } else {
2697       // Process 8 1d adst8 columns in parallel per iteration.
2698       int i = 0;
2699       do {
2700         Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
2701                                                  /*transpose=*/false);
2702         i += 8;
2703       } while (i < tx_width);
2704     }
2705   }
2706   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2707                                                    tx_width, 8, src, tx_type);
2708 }
2709 
Adst16TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2710 void Adst16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2711                                 void* src_buffer, int start_x, int start_y,
2712                                 void* dst_frame, bool is_row,
2713                                 int non_zero_coeff_count) {
2714   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2715   auto* src = static_cast<int16_t*>(src_buffer);
2716   const int tx_width = kTransformWidth[tx_size];
2717   const int tx_height = kTransformHeight[tx_size];
2718 
2719   if (is_row) {
2720     const bool should_round = kShouldRound[tx_size];
2721     const uint8_t row_shift = kTransformRowShift[tx_size];
2722 
2723     if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2724                      row_shift)) {
2725       return;
2726     }
2727 
2728     const int num_rows =
2729         GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
2730     if (should_round) {
2731       ApplyRounding<16>(src, num_rows);
2732     }
2733 
2734     if (num_rows <= 4) {
2735       // Process 4 1d adst16 rows in parallel.
2736       Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
2737                                                /*transpose=*/true);
2738     } else {
2739       int i = 0;
2740       do {
2741         // Process 8 1d adst16 rows in parallel per iteration.
2742         Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16],
2743                                                   16, /*transpose=*/true);
2744         i += 8;
2745       } while (i < num_rows);
2746     }
2747     // row_shift is always non zero here.
2748     RowShift<16>(src, num_rows, row_shift);
2749 
2750     return;
2751   }
2752 
2753   assert(!is_row);
2754   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2755     FlipColumns<16>(src, tx_width);
2756   }
2757 
2758   if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
2759     if (tx_width == 4) {
2760       // Process 4 1d adst16 columns in parallel.
2761       Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
2762                                                /*transpose=*/false);
2763     } else {
2764       int i = 0;
2765       do {
2766         // Process 8 1d adst16 columns in parallel per iteration.
2767         Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
2768                                                   /*transpose=*/false);
2769         i += 8;
2770       } while (i < tx_width);
2771     }
2772   }
2773   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
2774                                                    tx_width, 16, src, tx_type);
2775 }
2776 
Identity4TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2777 void Identity4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2778                                    void* src_buffer, int start_x, int start_y,
2779                                    void* dst_frame, bool is_row,
2780                                    int non_zero_coeff_count) {
2781   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2782   auto* src = static_cast<int16_t*>(src_buffer);
2783   const int tx_width = kTransformWidth[tx_size];
2784   const int tx_height = kTransformHeight[tx_size];
2785 
2786   if (is_row) {
2787     // Special case: Process row calculations during column transform call.
2788     // Improves performance.
2789     if (tx_type == kTransformTypeIdentityIdentity &&
2790         tx_size == kTransformSize4x4) {
2791       return;
2792     }
2793 
2794     const bool should_round = (tx_height == 8);
2795     if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2796                         tx_height)) {
2797       return;
2798     }
2799 
2800     const int num_rows =
2801         GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
2802     if (should_round) {
2803       ApplyRounding<4>(src, num_rows);
2804     }
2805     if (tx_height < 16) {
2806       int i = 0;
2807       do {
2808         Identity4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4);
2809         i += 4;
2810       } while (i < num_rows);
2811     } else {
2812       int i = 0;
2813       do {
2814         Identity4_SSE4_1<true>(&src[i * 4], &src[i * 4], /*step=*/4);
2815         i += 4;
2816       } while (i < num_rows);
2817     }
2818     return;
2819   }
2820   assert(!is_row);
2821   const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
2822   // Special case: Process row calculations during column transform call.
2823   if (tx_type == kTransformTypeIdentityIdentity &&
2824       (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
2825     Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
2826                                    src);
2827     return;
2828   }
2829 
2830   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2831     FlipColumns<4>(src, tx_width);
2832   }
2833 
2834   Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
2835 }
2836 
Identity8TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2837 void Identity8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2838                                    void* src_buffer, int start_x, int start_y,
2839                                    void* dst_frame, bool is_row,
2840                                    int non_zero_coeff_count) {
2841   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2842   auto* src = static_cast<int16_t*>(src_buffer);
2843   const int tx_width = kTransformWidth[tx_size];
2844   const int tx_height = kTransformHeight[tx_size];
2845 
2846   if (is_row) {
2847     // Special case: Process row calculations during column transform call.
2848     // Improves performance.
2849     if (tx_type == kTransformTypeIdentityIdentity &&
2850         tx_size == kTransformSize8x4) {
2851       return;
2852     }
2853 
2854     const bool should_round = kShouldRound[tx_size];
2855     const uint8_t row_shift = kTransformRowShift[tx_size];
2856     if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2857                         row_shift)) {
2858       return;
2859     }
2860 
2861     const int num_rows =
2862         GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
2863     if (should_round) {
2864       ApplyRounding<8>(src, num_rows);
2865     }
2866 
2867     // When combining the identity8 multiplier with the row shift, the
2868     // calculations for tx_height == 8 and tx_height == 16 can be simplified
2869     // from ((A * 2) + 1) >> 1) to A.
2870     if ((tx_height & 0x18) != 0) {
2871       return;
2872     }
2873     if (tx_height == 32) {
2874       int i = 0;
2875       do {
2876         Identity8Row32_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
2877         i += 4;
2878       } while (i < num_rows);
2879       return;
2880     }
2881 
2882     // Process kTransformSize8x4
2883     assert(tx_size == kTransformSize8x4);
2884     int i = 0;
2885     do {
2886       Identity8Row4_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
2887       i += 4;
2888     } while (i < num_rows);
2889     return;
2890   }
2891 
2892   assert(!is_row);
2893   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2894     FlipColumns<8>(src, tx_width);
2895   }
2896 
2897   const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
2898   Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
2899                                      src);
2900 }
2901 
Identity16TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2902 void Identity16TransformLoop_SSE4_1(TransformType tx_type,
2903                                     TransformSize tx_size, void* src_buffer,
2904                                     int start_x, int start_y, void* dst_frame,
2905                                     bool is_row, int non_zero_coeff_count) {
2906   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2907   auto* src = static_cast<int16_t*>(src_buffer);
2908   const int tx_width = kTransformWidth[tx_size];
2909   const int tx_height = kTransformHeight[tx_size];
2910 
2911   if (is_row) {
2912     const bool should_round = kShouldRound[tx_size];
2913     const uint8_t row_shift = kTransformRowShift[tx_size];
2914     if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
2915                          row_shift)) {
2916       return;
2917     }
2918 
2919     const int num_rows =
2920         GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
2921     if (should_round) {
2922       ApplyRounding<16>(src, num_rows);
2923     }
2924     int i = 0;
2925     do {
2926       Identity16Row_SSE4_1(&src[i * 16], &src[i * 16], /*step=*/16,
2927                            kTransformRowShift[tx_size]);
2928       i += 4;
2929     } while (i < num_rows);
2930     return;
2931   }
2932 
2933   assert(!is_row);
2934   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2935     FlipColumns<16>(src, tx_width);
2936   }
2937   const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
2938   Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
2939                                       src);
2940 }
2941 
Identity32TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2942 void Identity32TransformLoop_SSE4_1(TransformType tx_type,
2943                                     TransformSize tx_size, void* src_buffer,
2944                                     int start_x, int start_y, void* dst_frame,
2945                                     bool is_row, int non_zero_coeff_count) {
2946   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
2947   auto* src = static_cast<int16_t*>(src_buffer);
2948   const int tx_width = kTransformWidth[tx_size];
2949   const int tx_height = kTransformHeight[tx_size];
2950 
2951   if (is_row) {
2952     // When combining the identity32 multiplier with the row shift, the
2953     // calculations for tx_height == 8 and tx_height == 32 can be simplified
2954     // from ((A * 4) + 2) >> 2) to A.
2955     if ((tx_height & 0x28) != 0) {
2956       return;
2957     }
2958 
2959     // Process kTransformSize32x16. The src is always rounded before the
2960     // identity transform and shifted by 1 afterwards.
2961 
2962     if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
2963       return;
2964     }
2965 
2966     const int num_rows =
2967         GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);
2968 
2969     // Process kTransformSize32x16
2970     assert(tx_size == kTransformSize32x16);
2971     ApplyRounding<32>(src, num_rows);
2972     int i = 0;
2973     do {
2974       Identity32Row16_SSE4_1(&src[i * 32], &src[i * 32], /*step=*/32);
2975       i += 4;
2976     } while (i < num_rows);
2977     return;
2978   }
2979 
2980   assert(!is_row);
2981   const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
2982   Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
2983 }
2984 
Wht4TransformLoop_SSE4_1(TransformType tx_type,TransformSize tx_size,void * src_buffer,int start_x,int start_y,void * dst_frame,bool is_row,int non_zero_coeff_count)2985 void Wht4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
2986                               void* src_buffer, int start_x, int start_y,
2987                               void* dst_frame, bool is_row,
2988                               int non_zero_coeff_count) {
2989   assert(tx_type == kTransformTypeDctDct);
2990   assert(tx_size == kTransformSize4x4);
2991   static_cast<void>(tx_type);
2992   static_cast<void>(tx_size);
2993   if (is_row) {
2994     // Do both row and column transforms in the column-transform pass.
2995     return;
2996   }
2997 
2998   assert(!is_row);
2999   // Process 4 1d wht4 rows and columns in parallel.
3000   const auto* src = static_cast<int16_t*>(src_buffer);
3001   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
3002   Wht4_SSE4_1(frame, start_x, start_y, src, non_zero_coeff_count);
3003 }
3004 
3005 //------------------------------------------------------------------------------
3006 
// Installs every SSE4.1 inverse transform entry in |dsp|. The template
// parameters mirror the generic InitAll convention used elsewhere in the
// project; they are not referenced by this 8bpp-only body.
template <typename Residual, typename Pixel>
void InitAll(Dsp* const dsp) {
  // Maximum transform size for Dct is 64.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
      Dct4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
      Dct8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
      Dct16TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
      Dct32TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
      Dct64TransformLoop_SSE4_1;

  // Maximum transform size for Adst is 16.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
      Adst4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
      Adst8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
      Adst16TransformLoop_SSE4_1;

  // Maximum transform size for Identity transform is 32.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
      Identity4TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
      Identity8TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
      Identity16TransformLoop_SSE4_1;
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
      Identity32TransformLoop_SSE4_1;

  // Maximum transform size for Wht is 4.
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
      Wht4TransformLoop_SSE4_1;
}
3043 
// Registers the SSE4.1 inverse transforms in the writable 8bpp dsp table.
// With LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS every entry is installed via InitAll;
// otherwise each entry is installed only when its DSP_ENABLED_8BPP_SSE4_1
// switch is set.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
  InitAll<int16_t, uint8_t>(dsp);
#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
      Dct4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
      Dct8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
      Dct16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
      Dct32TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
      Dct64TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
      Adst4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
      Adst8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
      Adst16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
      Identity4TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
      Identity8TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
      Identity16TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
      Identity32TransformLoop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
      Wht4TransformLoop_SSE4_1;
#endif
#endif
}
3104 
3105 }  // namespace
3106 }  // namespace low_bitdepth
3107 
// Public entry point: installs the SSE4.1 inverse transforms (8bpp only).
void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
3109 
3110 }  // namespace dsp
3111 }  // namespace libgav1
3112 #else  // !LIBGAV1_ENABLE_SSE4_1
3113 namespace libgav1 {
3114 namespace dsp {
3115 
// SSE4.1 support is disabled at build time: leave the dsp table untouched.
void InverseTransformInit_SSE4_1() {}
3117 
3118 }  // namespace dsp
3119 }  // namespace libgav1
3120 #endif  // LIBGAV1_ENABLE_SSE4_1
3121