• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/inverse_transform.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
19 
20 #include <arm_neon.h>
21 
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
25 
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/utils/array_2d.h"
30 #include "src/utils/common.h"
31 #include "src/utils/compiler_attributes.h"
32 #include "src/utils/constants.h"
33 
34 namespace libgav1 {
35 namespace dsp {
36 namespace {
37 
38 // Include the constants and utility functions inside the anonymous namespace.
39 #include "src/dsp/inverse_transform.inc"
40 
41 //------------------------------------------------------------------------------
42 
// Transpose the 4x4 block of int32 lanes in |in| into |out|. Safe to call
// with in == out: all inputs are consumed before any output is written.
LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
                                        int32x4_t out[4]) {
  // in:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // First interleave rows 0/2 and 1/3:
  // 00 20 01 21   q02.val[0]
  // 02 22 03 23   q02.val[1]
  // 10 30 11 31   q13.val[0]
  // 12 32 13 33   q13.val[1]
  const int32x4x2_t q02 = vzipq_s32(in[0], in[2]);
  const int32x4x2_t q13 = vzipq_s32(in[1], in[3]);
  // A second interleave yields the transposed rows.
  const int32x4x2_t r01 = vzipq_s32(q02.val[0], q13.val[0]);
  const int32x4x2_t r23 = vzipq_s32(q02.val[1], q13.val[1]);
  out[0] = r01.val[0];
  out[1] = r01.val[1];
  out[2] = r23.val[0];
  out[3] = r23.val[1];
  // out:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
}
67 
68 //------------------------------------------------------------------------------
// Store |store_count| vectors of 4 int32 lanes from |s| into |dst|, one
// vector per row, starting at column |idx| and stepping |stride| elements
// per row.
template <int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
                                    const int32x4_t* const s) {
  // |store_count| is a compile-time constant; enforce the 4-row unroll
  // requirement at compile time rather than with a runtime assert().
  static_assert(store_count % 4 == 0, "store_count must be a multiple of 4");
  for (int i = 0; i < store_count; i += 4) {
    vst1q_s32(&dst[i * stride + idx], s[i]);
    vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
    vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
    vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
  }
}
80 
// Load |load_count| vectors of 4 int32 lanes from |src| into |x|, one
// vector per row, starting at column |idx| and stepping |stride| elements
// per row.
template <int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
                                   int32_t idx, int32x4_t* x) {
  // |load_count| is a compile-time constant; enforce the 4-row unroll
  // requirement at compile time rather than with a runtime assert().
  static_assert(load_count % 4 == 0, "load_count must be a multiple of 4");
  for (int i = 0; i < load_count; i += 4) {
    x[i] = vld1q_s32(&src[i * stride + idx]);
    x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
    x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
    x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
  }
}
92 
// Butterfly rotate 4 values: {a, b} <- {a*cos - b*sin, a*sin + b*cos} in
// 12-bit fixed point, with the two outputs exchanged when |flip| is set.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
                                               const int angle,
                                               const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  // The max range for the input is 18 bits. The cos128/sin128 is 13 bits,
  // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32
  // bit lane.
  const int32x4_t x0 = vmlsq_n_s32(vmulq_n_s32(*a, cos128), *b, sin128);
  const int32x4_t y0 = vmlaq_n_s32(vmulq_n_s32(*a, sin128), *b, cos128);
  // Round away the 12 fractional bits.
  const int32x4_t rotated_x = vrshrq_n_s32(x0, 12);
  const int32x4_t rotated_y = vrshrq_n_s32(y0, 12);
  *a = flip ? rotated_y : rotated_x;
  *b = flip ? rotated_x : rotated_y;
}
116 
// Specialization of the butterfly rotation for the case where the first
// input (*a) is known to be zero: both products come from *b alone.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
                                                         int32x4_t* b,
                                                         const int angle,
                                                         const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  assert(sin128 <= 0xfff);
  const int32x4_t rotated_x = vrshrq_n_s32(vmulq_n_s32(*b, -sin128), 12);
  const int32x4_t rotated_y = vrshrq_n_s32(vmulq_n_s32(*b, cos128), 12);
  *a = flip ? rotated_y : rotated_x;
  *b = flip ? rotated_x : rotated_y;
}
136 
// Specialization of the butterfly rotation for the case where the second
// input (*b) is known to be zero: both products come from *a alone.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
                                                          int32x4_t* b,
                                                          const int angle,
                                                          const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t rotated_x = vrshrq_n_s32(vmulq_n_s32(*a, cos128), 12);
  const int32x4_t rotated_y = vrshrq_n_s32(vmulq_n_s32(*a, sin128), 12);
  *a = flip ? rotated_y : rotated_x;
  *b = flip ? rotated_x : rotated_y;
}
155 
// Saturating butterfly add/subtract. Without |flip|: {a, b} <- {a+b, a-b}.
// With |flip|: {a, b} <- {b-a, b+a}.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip) {
  if (flip) {
    const int32x4_t sum = vqaddq_s32(*b, *a);
    *a = vqsubq_s32(*b, *a);
    *b = sum;
  } else {
    const int32x4_t diff = vqsubq_s32(*a, *b);
    *a = vqaddq_s32(*a, *b);
    *b = diff;
  }
}
169 
// Saturating butterfly add/subtract with the outputs clamped to
// [*min, *max]. Without |flip|: {a, b} <- {a+b, a-b}; with |flip|:
// {a, b} <- {b-a, b+a}.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip, const int32x4_t* min,
                                            const int32x4_t* max) {
  const int32x4_t first = flip ? *b : *a;
  const int32x4_t second = flip ? *a : *b;
  const int32x4_t sum = vqaddq_s32(first, second);
  const int32x4_t diff = vqsubq_s32(first, second);
  // When flipped, *a receives the difference and *b the sum.
  const int32x4_t to_a = flip ? diff : sum;
  const int32x4_t to_b = flip ? sum : diff;
  *a = vminq_s32(vmaxq_s32(to_a, *min), *max);
  *b = vminq_s32(vmaxq_s32(to_b, *min), *max);
}
184 
// Shared signature for ButterflyRotation_4 and its zero-input
// specializations; selected as a template argument by the Dct* routines
// below.
using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
                                       bool flip);
187 
188 //------------------------------------------------------------------------------
189 // Discrete Cosine Transforms (DCT).
190 
// Fast path for a row transform whose input has only a DC coefficient.
// Returns false (no work done) unless |adjusted_tx_height| is 1. Otherwise
// computes the DC-only dct result, shifts it by |row_shift|, clamps it to
// 16 bits and broadcasts it across the |width|-wide row.
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                     bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  // Select the rounded source only when |should_round| is set.
  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
  const int32_t cos128 = Cos128(32);
  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
  // vqrshlq_s32 will shift right if shift value is negative.
  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
  // Clamp result to signed 16 bits.
  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
  // Broadcast the dc value across the row, 4 lanes per store.
  int i = 0;
  do {
    vst1q_s32(dst, result);
    dst += 4;
    i += 4;
  } while (i < width);
  return true;
}
218 
// Fast path for a column transform whose input has only a DC row. Returns
// false (no work done) unless |adjusted_tx_height| is 1. Otherwise scales
// the first row by cos(pi/4) and copies it down the remaining |height| - 1
// rows.
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                           int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32_t cos128 = Cos128(32);

  // Calculate dc values for the first row, 4 lanes at a time.
  int i = 0;
  do {
    const int32x4_t v_src = vld1q_s32(&dst[i]);
    vst1q_s32(&dst[i], vqrdmulhq_n_s32(v_src, cos128 << (31 - 12)));
    i += 4;
  } while (i < width);

  // Replicate the first row into the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}
248 
// First 4-point section of the DCT butterfly network, operating in place on
// s[0..3]. When |is_fast_butterfly| is set, the second input of each
// rotation (s[1], s[3]) is assumed to be zero so the single-input rotation
// variants are used. When |is_last_stage| is false, the Hadamard outputs
// are clamped to [*min, *max] to keep later stages in range.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  if (is_last_stage) {
    // No later stage consumes these values, so skip the range clamp.
    HadamardRotation(&s[0], &s[3], false);
    HadamardRotation(&s[1], &s[2], false);
  } else {
    HadamardRotation(&s[0], &s[3], false, min, max);
    HadamardRotation(&s[1], &s[2], false, min, max);
  }
}
272 
// Process a dct4 pass over rows or columns, depending on |is_row|.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // When |is_row| is true, set range to the row range, otherwise, set to the
  // column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);

  int32x4_t x[4];
  LoadSrc<4>(dst, step, 0, x);
  if (is_row) Transpose4x4(x, x);

  // stage 1: bit-reverse permutation {0, 2, 1, 3}.
  int32x4_t s[4] = {x[0], x[2], x[1], x[3]};

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (auto& v : s) {
      // Round-shift by |row_shift| and clamp to signed 16 bits.
      v = vmovl_s16(vqmovn_s32(vqrshlq_s32(v, v_row_shift)));
    }
    Transpose4x4(s, s);
  }
  StoreDst<4>(dst, step, 0, s);
}
307 
// 8-point continuation of the DCT butterfly network, operating in place on
// s[0..7] (the rotation stages touch only s[4..7]; the final butterfly
// mixes all eight). When |is_fast_butterfly| is set, the zero input of each
// stage-8 rotation (s[7] for the first, s[5] for the second) is assumed to
// be zero. When |is_last_stage| is false, the final Hadamard outputs are
// clamped to [*min, *max].
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false, min, max);
  HadamardRotation(&s[6], &s[7], true, min, max);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  if (is_last_stage) {
    // No later stage consumes these values, so skip the range clamp.
    HadamardRotation(&s[0], &s[7], false);
    HadamardRotation(&s[1], &s[6], false);
    HadamardRotation(&s[2], &s[5], false);
    HadamardRotation(&s[3], &s[4], false);
  } else {
    HadamardRotation(&s[0], &s[7], false, min, max);
    HadamardRotation(&s[1], &s[6], false, min, max);
    HadamardRotation(&s[2], &s[5], false, min, max);
    HadamardRotation(&s[3], &s[4], false, min, max);
  }
}
342 
// Process dct8 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t x[8];

  if (is_row) {
    // Rows are loaded four columns at a time and transposed into place.
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1: bit-reverse permutation {0, 4, 2, 6, 1, 5, 3, 7}.
  int32x4_t s[8] = {x[0], x[4], x[2], x[6], x[1], x[5], x[3], x[7]};

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (auto& v : s) {
      // Round-shift by |row_shift| and clamp to signed 16 bits.
      v = vmovl_s16(vqmovn_s32(vqrshlq_s32(v, v_row_shift)));
    }
    Transpose4x4(&s[0], &s[0]);
    Transpose4x4(&s[4], &s[4]);
    StoreDst<4>(dst, step, 0, &s[0]);
    StoreDst<4>(dst, step, 4, &s[4]);
  } else {
    StoreDst<8>(dst, step, 0, &s[0]);
  }
}
389 
// 16-point continuation of the DCT butterfly network, operating in place on
// s[0..15] (the rotation stages touch only s[8..15]; the final butterfly
// mixes all sixteen). When |is_fast_butterfly| is set, the zero input of
// each stage-5 rotation (s[15]/s[9]/s[13]/s[11], alternating second/first)
// is assumed to be zero. When |is_last_stage| is false, the final Hadamard
// outputs are clamped to [*min, *max].
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false, min, max);
  HadamardRotation(&s[10], &s[11], true, min, max);
  HadamardRotation(&s[12], &s[13], false, min, max);
  HadamardRotation(&s[14], &s[15], true, min, max);

  // stage 14.
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false, min, max);
  HadamardRotation(&s[9], &s[10], false, min, max);
  HadamardRotation(&s[12], &s[15], true, min, max);
  HadamardRotation(&s[13], &s[14], true, min, max);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  if (is_last_stage) {
    // No later stage consumes these values, so skip the range clamp.
    HadamardRotation(&s[0], &s[15], false);
    HadamardRotation(&s[1], &s[14], false);
    HadamardRotation(&s[2], &s[13], false);
    HadamardRotation(&s[3], &s[12], false);
    HadamardRotation(&s[4], &s[11], false);
    HadamardRotation(&s[5], &s[10], false);
    HadamardRotation(&s[6], &s[9], false);
    HadamardRotation(&s[7], &s[8], false);
  } else {
    HadamardRotation(&s[0], &s[15], false, min, max);
    HadamardRotation(&s[1], &s[14], false, min, max);
    HadamardRotation(&s[2], &s[13], false, min, max);
    HadamardRotation(&s[3], &s[12], false, min, max);
    HadamardRotation(&s[4], &s[11], false, min, max);
    HadamardRotation(&s[5], &s[10], false, min, max);
    HadamardRotation(&s[6], &s[9], false, min, max);
    HadamardRotation(&s[7], &s[8], false, min, max);
  }
}
449 
// Process dct16 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t x[16];

  if (is_row) {
    // Rows are loaded four columns at a time and transposed into place.
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1: bit-reverse permutation of the 16 inputs.
  static constexpr int8_t kBitReverse16[16] = {0, 8, 4, 12, 2, 10, 6, 14,
                                               1, 9, 5, 13, 3, 11, 7, 15};
  int32x4_t s[16];
  for (int i = 0; i < 16; ++i) s[i] = x[kBitReverse16[i]];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (auto& v : s) {
      // Round-shift by |row_shift| and clamp to signed 16 bits.
      v = vmovl_s16(vqmovn_s32(vqrshlq_s32(v, v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&s[idx], &s[idx]);
      Transpose4x4(&s[idx + 4], &s[idx + 4]);
      StoreDst<4>(dst, step, idx, &s[idx]);
      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &s[0]);
  }
}
509 
// 32-point continuation of the DCT butterfly network, operating in place on
// s[0..31] (the rotation stages touch only s[16..31]; the final butterfly
// mixes all thirty-two). When |is_fast_butterfly| is set, the zero input of
// each stage-3 rotation is assumed to be zero (alternating second/first
// inputs, matching the _SecondIsZero/_FirstIsZero calls). When
// |is_last_stage| is false, the final Hadamard outputs are clamped to
// [*min, *max].
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }

  // stage 6.
  HadamardRotation(&s[16], &s[17], false, min, max);
  HadamardRotation(&s[18], &s[19], true, min, max);
  HadamardRotation(&s[20], &s[21], false, min, max);
  HadamardRotation(&s[22], &s[23], true, min, max);
  HadamardRotation(&s[24], &s[25], false, min, max);
  HadamardRotation(&s[26], &s[27], true, min, max);
  HadamardRotation(&s[28], &s[29], false, min, max);
  HadamardRotation(&s[30], &s[31], true, min, max);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false, min, max);
  HadamardRotation(&s[17], &s[18], false, min, max);
  HadamardRotation(&s[20], &s[23], true, min, max);
  HadamardRotation(&s[21], &s[22], true, min, max);
  HadamardRotation(&s[24], &s[27], false, min, max);
  HadamardRotation(&s[25], &s[26], false, min, max);
  HadamardRotation(&s[28], &s[31], true, min, max);
  HadamardRotation(&s[29], &s[30], true, min, max);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false, min, max);
  HadamardRotation(&s[17], &s[22], false, min, max);
  HadamardRotation(&s[18], &s[21], false, min, max);
  HadamardRotation(&s[19], &s[20], false, min, max);
  HadamardRotation(&s[24], &s[31], true, min, max);
  HadamardRotation(&s[25], &s[30], true, min, max);
  HadamardRotation(&s[26], &s[29], true, min, max);
  HadamardRotation(&s[27], &s[28], true, min, max);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  if (is_last_stage) {
    // No later stage consumes these values, so skip the range clamp.
    HadamardRotation(&s[0], &s[31], false);
    HadamardRotation(&s[1], &s[30], false);
    HadamardRotation(&s[2], &s[29], false);
    HadamardRotation(&s[3], &s[28], false);
    HadamardRotation(&s[4], &s[27], false);
    HadamardRotation(&s[5], &s[26], false);
    HadamardRotation(&s[6], &s[25], false);
    HadamardRotation(&s[7], &s[24], false);
    HadamardRotation(&s[8], &s[23], false);
    HadamardRotation(&s[9], &s[22], false);
    HadamardRotation(&s[10], &s[21], false);
    HadamardRotation(&s[11], &s[20], false);
    HadamardRotation(&s[12], &s[19], false);
    HadamardRotation(&s[13], &s[18], false);
    HadamardRotation(&s[14], &s[17], false);
    HadamardRotation(&s[15], &s[16], false);
  } else {
    HadamardRotation(&s[0], &s[31], false, min, max);
    HadamardRotation(&s[1], &s[30], false, min, max);
    HadamardRotation(&s[2], &s[29], false, min, max);
    HadamardRotation(&s[3], &s[28], false, min, max);
    HadamardRotation(&s[4], &s[27], false, min, max);
    HadamardRotation(&s[5], &s[26], false, min, max);
    HadamardRotation(&s[6], &s[25], false, min, max);
    HadamardRotation(&s[7], &s[24], false, min, max);
    HadamardRotation(&s[8], &s[23], false, min, max);
    HadamardRotation(&s[9], &s[22], false, min, max);
    HadamardRotation(&s[10], &s[21], false, min, max);
    HadamardRotation(&s[11], &s[20], false, min, max);
    HadamardRotation(&s[12], &s[19], false, min, max);
    HadamardRotation(&s[13], &s[18], false, min, max);
    HadamardRotation(&s[14], &s[17], false, min, max);
    HadamardRotation(&s[15], &s[16], false, min, max);
  }
}
621 
// Process dct32 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
                                      const bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t x[32];

  if (is_row) {
    // Rows are loaded four columns at a time and transposed into place.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1: bit-reverse permutation of the 32 inputs.
  static constexpr int8_t kBitReverse32[32] = {
      0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
      1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31};
  int32x4_t s[32];
  for (int i = 0; i < 32; ++i) s[i] = x[kBitReverse32[i]];

  Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 32; idx += 8) {
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (auto& v : output) {
        // Round-shift by |row_shift| and clamp to signed 16 bits.
        v = vmovl_s16(vqmovn_s32(vqrshlq_s32(v, v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<32>(dst, step, 0, &s[0]);
  }
}
701 
Dct64_NEON(void * dest,int32_t step,bool is_row,int row_shift)702 void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
703   auto* const dst = static_cast<int32_t*>(dest);
704   const int32_t range = is_row ? kBitdepth10 + 7 : 15;
705   const int32x4_t min = vdupq_n_s32(-(1 << range));
706   const int32x4_t max = vdupq_n_s32((1 << range) - 1);
707   int32x4_t s[64], x[32];
708 
709   if (is_row) {
710     // The last 32 values of every row are always zero if the |tx_width| is
711     // 64.
712     for (int idx = 0; idx < 32; idx += 8) {
713       LoadSrc<4>(dst, step, idx, &x[idx]);
714       LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
715       Transpose4x4(&x[idx], &x[idx]);
716       Transpose4x4(&x[idx + 4], &x[idx + 4]);
717     }
718   } else {
719     // The last 32 values of every column are always zero if the |tx_height| is
720     // 64.
721     LoadSrc<32>(dst, step, 0, &x[0]);
722   }
723 
724   // stage 1
725   // kBitReverseLookup
726   // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
727   s[0] = x[0];
728   s[2] = x[16];
729   s[4] = x[8];
730   s[6] = x[24];
731   s[8] = x[4];
732   s[10] = x[20];
733   s[12] = x[12];
734   s[14] = x[28];
735 
736   // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
737   s[16] = x[2];
738   s[18] = x[18];
739   s[20] = x[10];
740   s[22] = x[26];
741   s[24] = x[6];
742   s[26] = x[22];
743   s[28] = x[14];
744   s[30] = x[30];
745 
746   // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
747   s[32] = x[1];
748   s[34] = x[17];
749   s[36] = x[9];
750   s[38] = x[25];
751   s[40] = x[5];
752   s[42] = x[21];
753   s[44] = x[13];
754   s[46] = x[29];
755 
756   // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
757   s[48] = x[3];
758   s[50] = x[19];
759   s[52] = x[11];
760   s[54] = x[27];
761   s[56] = x[7];
762   s[58] = x[23];
763   s[60] = x[15];
764   s[62] = x[31];
765 
766   Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
767       s, &min, &max, /*is_last_stage=*/false);
768   Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
769       s, &min, &max, /*is_last_stage=*/false);
770   Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
771       s, &min, &max, /*is_last_stage=*/false);
772   Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
773       s, &min, &max, /*is_last_stage=*/false);
774 
775   //-- start dct 64 stages
776   // stage 2.
777   ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
778   ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
779   ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
780   ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
781   ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
782   ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
783   ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
784   ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
785   ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
786   ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
787   ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
788   ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
789   ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
790   ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
791   ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
792   ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
793 
794   // stage 4.
795   HadamardRotation(&s[32], &s[33], false, &min, &max);
796   HadamardRotation(&s[34], &s[35], true, &min, &max);
797   HadamardRotation(&s[36], &s[37], false, &min, &max);
798   HadamardRotation(&s[38], &s[39], true, &min, &max);
799   HadamardRotation(&s[40], &s[41], false, &min, &max);
800   HadamardRotation(&s[42], &s[43], true, &min, &max);
801   HadamardRotation(&s[44], &s[45], false, &min, &max);
802   HadamardRotation(&s[46], &s[47], true, &min, &max);
803   HadamardRotation(&s[48], &s[49], false, &min, &max);
804   HadamardRotation(&s[50], &s[51], true, &min, &max);
805   HadamardRotation(&s[52], &s[53], false, &min, &max);
806   HadamardRotation(&s[54], &s[55], true, &min, &max);
807   HadamardRotation(&s[56], &s[57], false, &min, &max);
808   HadamardRotation(&s[58], &s[59], true, &min, &max);
809   HadamardRotation(&s[60], &s[61], false, &min, &max);
810   HadamardRotation(&s[62], &s[63], true, &min, &max);
811 
812   // stage 7.
813   ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
814   ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
815   ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
816   ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
817   ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
818   ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
819   ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
820   ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
821 
822   // stage 11.
823   HadamardRotation(&s[32], &s[35], false, &min, &max);
824   HadamardRotation(&s[33], &s[34], false, &min, &max);
825   HadamardRotation(&s[36], &s[39], true, &min, &max);
826   HadamardRotation(&s[37], &s[38], true, &min, &max);
827   HadamardRotation(&s[40], &s[43], false, &min, &max);
828   HadamardRotation(&s[41], &s[42], false, &min, &max);
829   HadamardRotation(&s[44], &s[47], true, &min, &max);
830   HadamardRotation(&s[45], &s[46], true, &min, &max);
831   HadamardRotation(&s[48], &s[51], false, &min, &max);
832   HadamardRotation(&s[49], &s[50], false, &min, &max);
833   HadamardRotation(&s[52], &s[55], true, &min, &max);
834   HadamardRotation(&s[53], &s[54], true, &min, &max);
835   HadamardRotation(&s[56], &s[59], false, &min, &max);
836   HadamardRotation(&s[57], &s[58], false, &min, &max);
837   HadamardRotation(&s[60], &s[63], true, &min, &max);
838   HadamardRotation(&s[61], &s[62], true, &min, &max);
839 
840   // stage 16.
841   ButterflyRotation_4(&s[61], &s[34], 56, true);
842   ButterflyRotation_4(&s[60], &s[35], 56, true);
843   ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
844   ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
845   ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
846   ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
847   ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
848   ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
849 
850   // stage 21.
851   HadamardRotation(&s[32], &s[39], false, &min, &max);
852   HadamardRotation(&s[33], &s[38], false, &min, &max);
853   HadamardRotation(&s[34], &s[37], false, &min, &max);
854   HadamardRotation(&s[35], &s[36], false, &min, &max);
855   HadamardRotation(&s[40], &s[47], true, &min, &max);
856   HadamardRotation(&s[41], &s[46], true, &min, &max);
857   HadamardRotation(&s[42], &s[45], true, &min, &max);
858   HadamardRotation(&s[43], &s[44], true, &min, &max);
859   HadamardRotation(&s[48], &s[55], false, &min, &max);
860   HadamardRotation(&s[49], &s[54], false, &min, &max);
861   HadamardRotation(&s[50], &s[53], false, &min, &max);
862   HadamardRotation(&s[51], &s[52], false, &min, &max);
863   HadamardRotation(&s[56], &s[63], true, &min, &max);
864   HadamardRotation(&s[57], &s[62], true, &min, &max);
865   HadamardRotation(&s[58], &s[61], true, &min, &max);
866   HadamardRotation(&s[59], &s[60], true, &min, &max);
867 
868   // stage 25.
869   ButterflyRotation_4(&s[59], &s[36], 48, true);
870   ButterflyRotation_4(&s[58], &s[37], 48, true);
871   ButterflyRotation_4(&s[57], &s[38], 48, true);
872   ButterflyRotation_4(&s[56], &s[39], 48, true);
873   ButterflyRotation_4(&s[55], &s[40], 112, true);
874   ButterflyRotation_4(&s[54], &s[41], 112, true);
875   ButterflyRotation_4(&s[53], &s[42], 112, true);
876   ButterflyRotation_4(&s[52], &s[43], 112, true);
877 
878   // stage 28.
879   HadamardRotation(&s[32], &s[47], false, &min, &max);
880   HadamardRotation(&s[33], &s[46], false, &min, &max);
881   HadamardRotation(&s[34], &s[45], false, &min, &max);
882   HadamardRotation(&s[35], &s[44], false, &min, &max);
883   HadamardRotation(&s[36], &s[43], false, &min, &max);
884   HadamardRotation(&s[37], &s[42], false, &min, &max);
885   HadamardRotation(&s[38], &s[41], false, &min, &max);
886   HadamardRotation(&s[39], &s[40], false, &min, &max);
887   HadamardRotation(&s[48], &s[63], true, &min, &max);
888   HadamardRotation(&s[49], &s[62], true, &min, &max);
889   HadamardRotation(&s[50], &s[61], true, &min, &max);
890   HadamardRotation(&s[51], &s[60], true, &min, &max);
891   HadamardRotation(&s[52], &s[59], true, &min, &max);
892   HadamardRotation(&s[53], &s[58], true, &min, &max);
893   HadamardRotation(&s[54], &s[57], true, &min, &max);
894   HadamardRotation(&s[55], &s[56], true, &min, &max);
895 
896   // stage 30.
897   ButterflyRotation_4(&s[55], &s[40], 32, true);
898   ButterflyRotation_4(&s[54], &s[41], 32, true);
899   ButterflyRotation_4(&s[53], &s[42], 32, true);
900   ButterflyRotation_4(&s[52], &s[43], 32, true);
901   ButterflyRotation_4(&s[51], &s[44], 32, true);
902   ButterflyRotation_4(&s[50], &s[45], 32, true);
903   ButterflyRotation_4(&s[49], &s[46], 32, true);
904   ButterflyRotation_4(&s[48], &s[47], 32, true);
905 
906   // stage 31.
907   for (int i = 0; i < 32; i += 4) {
908     HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
909     HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
910     HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
911     HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
912   }
913   //-- end dct 64 stages
914   if (is_row) {
915     const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
916     for (int idx = 0; idx < 64; idx += 8) {
917       int32x4_t output[8];
918       Transpose4x4(&s[idx], &output[0]);
919       Transpose4x4(&s[idx + 4], &output[4]);
920       for (int i = 0; i < 8; ++i) {
921         output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
922       }
923       StoreDst<4>(dst, step, idx, &output[0]);
924       StoreDst<4>(dst, step, idx + 4, &output[4]);
925     }
926   } else {
927     StoreDst<64>(dst, step, 0, &s[0]);
928   }
929 }
930 
931 //------------------------------------------------------------------------------
932 // Asymmetric Discrete Sine Transforms (ADST).
// In-place 4-point ADST over int32 coefficients. Row passes (is_row == true)
// transpose the 4x4 tile on load/store and narrow the result with the
// saturating |row_shift| rounding. The multiplies use the 12-bit fixed-point
// kAdst4Multiplier constants; the 12 fractional bits are rounded away at the
// end with vrshrq_n_s32. Stage comments follow the scalar reference's
// numbering.
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];
  int32x4_t x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    // Row passes operate on transposed data.
    Transpose4x4(x, x);
  }

  // stage 1.
  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);

  // stage 2.
  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
  const int32x4_t b7 = vaddq_s32(a7, x[3]);

  // stage 3.
  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
  // s[0] = s[0] + s[3]
  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
  // s[1] = s[1] - s[4]
  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);

  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);

  // stage 4.
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[6]);

  // stages 5 and 6: combine terms, then round away the 12 fractional bits.
  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
  x[0] = vrshrq_n_s32(x0, 12);
  x[1] = vrshrq_n_s32(x1, 12);
  x[2] = vrshrq_n_s32(s[2], 12);
  x[3] = vrshrq_n_s32(x3, 12);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for a negative shift count;
    // the narrow/widen pair saturates the row output to 16 bits.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
    x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
    x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
    x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
    Transpose4x4(x, x);
  }
  StoreDst<4>(dst, step, 0, x);
}
987 
// Multipliers for the Adst4DcOnly() fast path, arranged so that a single
// vector multiply against the broadcast DC value yields
// {s0*k0, s0*k1, s0*k2, s0*k1} (see the lane comments in Adst4DcOnly).
alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
                                                           2482};
990 
// DC-only fast path of the 4-point ADST row transform: when
// adjusted_tx_height == 1 the lone nonzero coefficient lets all four outputs
// be computed with one vector multiply. Returns false when the fast path
// does not apply.
LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);

  // Broadcast the DC coefficient and select the pre-rounded variant when
  // requested. kTransformRowMultiplier is a Q12 constant rescaled to Q31
  // for vqrdmulhq_n_s32.
  const int32x4_t dc = vdupq_n_s32(dst[0]);
  const int32x4_t dc_round =
      vqrdmulhq_n_s32(dc, kTransformRowMultiplier << (31 - 12));
  const uint32x4_t round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t src = vbslq_s32(round_mask, dc_round, dc);

  // products = s0*k0 | s0*k1 | s0*k2 | s0*k1
  const int32x4_t multipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
  const int32x4_t products = vmulq_s32(multipliers, src);
  // shifted = 0 | 0 | 0 | s0*k0, so the sum's last lane becomes
  // s0*(k0 + k1) while the other lanes are unchanged.
  const int32x4_t shifted = vextq_s32(vdupq_n_s32(0), products, 1);
  const int32x4_t sum = vaddq_s32(products, shifted);
  const int32x4_t rounded = vrshrq_n_s32(sum, 12);

  // vqrshlq_s32 shifts right (with rounding) when the shift count is
  // negative; the narrow/widen pair saturates the output to 16 bits.
  vst1q_s32(dst, vmovl_s16(vqmovn_s32(
                     vqrshlq_s32(rounded, vdupq_n_s32(-row_shift)))));

  return true;
}
1021 
// DC-only fast path of the 4-point ADST column transform: when only the
// first input row is nonzero (adjusted_tx_height == 1), each output row is
// just the source row scaled by an ADST constant. Processes four columns
// per iteration. Returns false when the fast path does not apply.
LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);

  for (int col = 0; col < width; col += 4) {
    const int32x4_t src = vld1q_s32(&dst[col]);

    // With a single nonzero input row the stage multiplies collapse to
    // scaling the source by each 12-bit ADST constant.
    const int32x4_t row0 = vmulq_n_s32(src, kAdst4Multiplier[0]);
    const int32x4_t row1 = vmulq_n_s32(src, kAdst4Multiplier[1]);
    const int32x4_t row2 = vmulq_n_s32(src, kAdst4Multiplier[2]);
    // The last output row is the sum of the first two products.
    const int32x4_t row3 = vaddq_s32(row0, row1);

    // Round away the 12 fractional bits and store; output rows are |width|
    // elements apart.
    vst1q_s32(&dst[col], vrshrq_n_s32(row0, 12));
    vst1q_s32(&dst[col + width * 1], vrshrq_n_s32(row1, 12));
    vst1q_s32(&dst[col + width * 2], vrshrq_n_s32(row2, 12));
    vst1q_s32(&dst[col + width * 3], vrshrq_n_s32(row3, 12));
  }

  return true;
}
1056 
// In-place 8-point ADST over int32 coefficients. |butterfly_rotation| is the
// rotation implementation chosen by the caller (exact or fast). Row passes
// (is_row == true) transpose 4x4 tiles on load/store and narrow the output
// with the saturating |row_shift| rounding; column passes load/store the
// data directly.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Clamp range for the Hadamard stages: rows use bitdepth + 7 bits,
  // columns 15 bits.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[8], x[8];

  if (is_row) {
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1: input permutation.
  s[0] = x[7];
  s[1] = x[0];
  s[2] = x[5];
  s[3] = x[2];
  s[4] = x[3];
  s[5] = x[4];
  s[6] = x[1];
  s[7] = x[6];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
  butterfly_rotation(&s[6], &s[7], 60 - 48, true);

  // stage 3.
  HadamardRotation(&s[0], &s[4], false, &min, &max);
  HadamardRotation(&s[1], &s[5], false, &min, &max);
  HadamardRotation(&s[2], &s[6], false, &min, &max);
  HadamardRotation(&s[3], &s[7], false, &min, &max);

  // stage 4.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[2], false, &min, &max);
  HadamardRotation(&s[4], &s[6], false, &min, &max);
  HadamardRotation(&s[1], &s[3], false, &min, &max);
  HadamardRotation(&s[5], &s[7], false, &min, &max);

  // stage 6.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);

  // stage 7: output permutation with sign flips (vqnegq_s32 saturates
  // INT32_MIN rather than overflowing).
  x[0] = s[0];
  x[1] = vqnegq_s32(s[4]);
  x[2] = s[6];
  x[3] = vqnegq_s32(s[2]);
  x[4] = s[3];
  x[5] = vqnegq_s32(s[7]);
  x[6] = s[5];
  x[7] = vqnegq_s32(s[1]);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for a negative shift count;
    // the narrow/widen pair saturates the row output to 16 bits.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 8; ++i) {
      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
    }
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
    StoreDst<4>(dst, step, 0, &x[0]);
    StoreDst<4>(dst, step, 4, &x[4]);
  } else {
    StoreDst<8>(dst, step, 0, &x[0]);
  }
}
1134 
// DC-only fast path of the 8-point ADST row transform
// (adjusted_tx_height == 1). With every other input zero the Hadamard
// stages reduce to plain copies, leaving only the butterfly rotations.
// Returns false when the fast path does not apply.
LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];

  // Broadcast the DC value and select the pre-rounded variant when
  // |should_round| is set. kTransformRowMultiplier is a Q12 constant
  // rescaled to Q31 for vqrdmulhq_n_s32.
  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  // stage 1.
  s[1] = vbslq_s32(v_mask, v_src_round, v_src);

  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

  // stage 3: Hadamard stage degenerates to copies (other inputs are zero).
  s[4] = s[0];
  s[5] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[4], &s[5], 48, true);

  // stage 5: copies again.
  s[2] = s[0];
  s[3] = s[1];
  s[6] = s[4];
  s[7] = s[5];

  // stage 6.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);

  // stage 7: output permutation with sign flips.
  int32x4_t x[8];
  x[0] = s[0];
  x[1] = vqnegq_s32(s[4]);
  x[2] = s[6];
  x[3] = vqnegq_s32(s[2]);
  x[4] = s[3];
  x[5] = vqnegq_s32(s[7]);
  x[6] = s[5];
  x[7] = vqnegq_s32(s[1]);

  // Apply the row shift and store one output per coefficient (lane 0 only).
  for (int i = 0; i < 8; ++i) {
    // vqrshlq_s32 will shift right if shift value is negative.
    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
    vst1q_lane_s32(&dst[i], x[i], 0);
  }

  return true;
}
1188 
// DC-only fast path of the 8-point ADST column transform: when only the
// first input row is nonzero (adjusted_tx_height == 1), computes the 8
// output rows for |width| columns, four columns at a time. Returns false
// when the fast path does not apply.
LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];

  int i = 0;
  do {
    // Four columns of the (single nonzero) source row.
    const int32x4_t v_src = vld1q_s32(dst);
    // stage 1.
    s[1] = v_src;

    // stage 2.
    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

    // stage 3: Hadamard stage degenerates to copies (other inputs are zero).
    s[4] = s[0];
    s[5] = s[1];

    // stage 4.
    ButterflyRotation_4(&s[4], &s[5], 48, true);

    // stage 5: copies again.
    s[2] = s[0];
    s[3] = s[1];
    s[6] = s[4];
    s[7] = s[5];

    // stage 6.
    ButterflyRotation_4(&s[2], &s[3], 32, true);
    ButterflyRotation_4(&s[6], &s[7], 32, true);

    // stage 7: output permutation with sign flips.
    int32x4_t x[8];
    x[0] = s[0];
    x[1] = vqnegq_s32(s[4]);
    x[2] = s[6];
    x[3] = vqnegq_s32(s[2]);
    x[4] = s[3];
    x[5] = vqnegq_s32(s[7]);
    x[6] = s[5];
    x[7] = vqnegq_s32(s[1]);

    // Output rows are |width| elements apart.
    for (int j = 0; j < 8; ++j) {
      vst1q_s32(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}
1242 
// In-place 16-point ADST over int32 coefficients. |butterfly_rotation|
// selects the rotation implementation (exact or fast). Row passes
// (is_row == true) transpose 4x4 tiles on load/store and narrow the output
// with the saturating |row_shift| rounding; column passes load/store the
// data directly.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
                                       int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Clamp range for the Hadamard stages: rows use bitdepth + 7 bits,
  // columns 15 bits.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[16], x[16];

  if (is_row) {
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1: input permutation.
  s[0] = x[15];
  s[1] = x[0];
  s[2] = x[13];
  s[3] = x[2];
  s[4] = x[11];
  s[5] = x[4];
  s[6] = x[9];
  s[7] = x[6];
  s[8] = x[7];
  s[9] = x[8];
  s[10] = x[5];
  s[11] = x[10];
  s[12] = x[3];
  s[13] = x[12];
  s[14] = x[1];
  s[15] = x[14];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
  butterfly_rotation(&s[14], &s[15], 62 - 56, true);

  // stage 3.
  HadamardRotation(&s[0], &s[8], false, &min, &max);
  HadamardRotation(&s[1], &s[9], false, &min, &max);
  HadamardRotation(&s[2], &s[10], false, &min, &max);
  HadamardRotation(&s[3], &s[11], false, &min, &max);
  HadamardRotation(&s[4], &s[12], false, &min, &max);
  HadamardRotation(&s[5], &s[13], false, &min, &max);
  HadamardRotation(&s[6], &s[14], false, &min, &max);
  HadamardRotation(&s[7], &s[15], false, &min, &max);

  // stage 4.
  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
  butterfly_rotation(&s[15], &s[14], 8 + 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[4], false, &min, &max);
  HadamardRotation(&s[8], &s[12], false, &min, &max);
  HadamardRotation(&s[1], &s[5], false, &min, &max);
  HadamardRotation(&s[9], &s[13], false, &min, &max);
  HadamardRotation(&s[2], &s[6], false, &min, &max);
  HadamardRotation(&s[10], &s[14], false, &min, &max);
  HadamardRotation(&s[3], &s[7], false, &min, &max);
  HadamardRotation(&s[11], &s[15], false, &min, &max);

  // stage 6.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
  butterfly_rotation(&s[15], &s[14], 48 - 32, true);

  // stage 7.
  HadamardRotation(&s[0], &s[2], false, &min, &max);
  HadamardRotation(&s[4], &s[6], false, &min, &max);
  HadamardRotation(&s[8], &s[10], false, &min, &max);
  HadamardRotation(&s[12], &s[14], false, &min, &max);
  HadamardRotation(&s[1], &s[3], false, &min, &max);
  HadamardRotation(&s[5], &s[7], false, &min, &max);
  HadamardRotation(&s[9], &s[11], false, &min, &max);
  HadamardRotation(&s[13], &s[15], false, &min, &max);

  // stage 8.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);
  butterfly_rotation(&s[10], &s[11], 32, true);
  butterfly_rotation(&s[14], &s[15], 32, true);

  // stage 9: output permutation with sign flips (vqnegq_s32 saturates
  // INT32_MIN rather than overflowing).
  x[0] = s[0];
  x[1] = vqnegq_s32(s[8]);
  x[2] = s[12];
  x[3] = vqnegq_s32(s[4]);
  x[4] = s[6];
  x[5] = vqnegq_s32(s[14]);
  x[6] = s[10];
  x[7] = vqnegq_s32(s[2]);
  x[8] = s[3];
  x[9] = vqnegq_s32(s[11]);
  x[10] = s[15];
  x[11] = vqnegq_s32(s[7]);
  x[12] = s[5];
  x[13] = vqnegq_s32(s[13]);
  x[14] = s[9];
  x[15] = vqnegq_s32(s[1]);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for a negative shift count;
    // the narrow/widen pair saturates the row output to 16 bits.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 16; ++i) {
      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
      StoreDst<4>(dst, step, idx, &x[idx]);
      StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &x[0]);
  }
}
1372 
// Shared tail of the 16-point ADST DC-only paths. On entry s[1] holds the
// broadcast DC input; the 16 outputs are written to |x|. With every other
// input zero the Hadamard stages reduce to plain copies, so only the
// butterfly rotations remain.
LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);

  // stage 3: copies (other inputs are zero).
  s[8] = s[0];
  s[9] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[8], &s[9], 56, true);

  // stage 5: copies.
  s[4] = s[0];
  s[12] = s[8];
  s[5] = s[1];
  s[13] = s[9];

  // stage 6.
  ButterflyRotation_4(&s[4], &s[5], 48, true);
  ButterflyRotation_4(&s[12], &s[13], 48, true);

  // stage 7: copies.
  s[2] = s[0];
  s[6] = s[4];
  s[10] = s[8];
  s[14] = s[12];
  s[3] = s[1];
  s[7] = s[5];
  s[11] = s[9];
  s[15] = s[13];

  // stage 8.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);
  ButterflyRotation_4(&s[10], &s[11], 32, true);
  ButterflyRotation_4(&s[14], &s[15], 32, true);

  // stage 9: output permutation with sign flips.
  x[0] = s[0];
  x[1] = vqnegq_s32(s[8]);
  x[2] = s[12];
  x[3] = vqnegq_s32(s[4]);
  x[4] = s[6];
  x[5] = vqnegq_s32(s[14]);
  x[6] = s[10];
  x[7] = vqnegq_s32(s[2]);
  x[8] = s[3];
  x[9] = vqnegq_s32(s[11]);
  x[10] = s[15];
  x[11] = vqnegq_s32(s[7]);
  x[12] = s[5];
  x[13] = vqnegq_s32(s[13]);
  x[14] = s[9];
  x[15] = vqnegq_s32(s[1]);
}
1428 
// DC-only fast path of the 16-point ADST row transform
// (adjusted_tx_height == 1). Returns false when the fast path does not
// apply.
LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[16];
  int32x4_t x[16];

  // Broadcast the DC coefficient and select the pre-rounded variant when
  // requested. kTransformRowMultiplier is a Q12 constant rescaled to Q31
  // for vqrdmulhq_n_s32.
  const int32x4_t dc = vdupq_n_s32(dst[0]);
  const int32x4_t dc_round =
      vqrdmulhq_n_s32(dc, kTransformRowMultiplier << (31 - 12));
  const uint32x4_t round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  // stage 1.
  s[1] = vbslq_s32(round_mask, dc_round, dc);

  Adst16DcOnlyInternal(s, x);

  // Apply the row shift and store one output per coefficient (lane 0 only).
  // vqrshlq_s32 shifts right (with rounding) for a negative shift count.
  const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
  for (int i = 0; i < 16; ++i) {
    const int32x4_t shifted = vqrshlq_s32(x[i], v_row_shift);
    vst1q_lane_s32(&dst[i], vmovl_s16(vqmovn_s32(shifted)), 0);
  }

  return true;
}
1453 
// DC-only fast path of the 16-point ADST column transform: when only the
// first input row is nonzero (adjusted_tx_height == 1), computes the 16
// output rows for |width| columns, four columns at a time. Returns false
// when the fast path does not apply.
LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
                                              int adjusted_tx_height,
                                              int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int i = 0;
  do {
    int32x4_t s[16];
    int32x4_t x[16];
    // Four columns of the (single nonzero) source row.
    const int32x4_t v_src = vld1q_s32(dst);
    // stage 1.
    s[1] = v_src;

    Adst16DcOnlyInternal(s, x);

    // Output rows are |width| elements apart.
    for (int j = 0; j < 16; ++j) {
      vst1q_s32(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}
1479 
1480 //------------------------------------------------------------------------------
1481 // Identity Transforms.
1482 
// In-place 4-point identity transform: multiplies each coefficient by
// kIdentity4Multiplier (a 12-bit fixed-point constant), applies a combined
// rounding bias, shifts down by (12 + shift), and saturates to 16 bits.
LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Single bias covering the rounding of both the 12-bit multiplier shift
  // and the extra |shift|.
  const int32x4_t round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t multiplier = vdupq_n_s32(kIdentity4Multiplier);
  const int32x4_t down_shift = vdupq_n_s32(-(12 + shift));
  int32_t* row = dst;
  for (int i = 0; i < 4; ++i, row += step) {
    const int32x4_t src = vld1q_s32(row);
    const int32x4_t scaled = vmlaq_s32(round, src, multiplier);
    // vqshlq_s32 shifts right (saturating) for a negative shift count; the
    // narrow/widen pair clamps the result to 16 bits.
    const int32x4_t shifted = vqshlq_s32(scaled, down_shift);
    vst1q_s32(row, vmovl_s16(vqmovn_s32(shifted)));
  }
}
1496 
// DC-only fast path of the 4-point identity row transform
// (adjusted_tx_height == 1): scales dst[0] by kIdentity4Multiplier with a
// combined round/shift and stores the single output. Returns false when the
// fast path does not apply.
LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
                                           bool should_round, int tx_height) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  // Broadcast the DC value and select the pre-rounded variant when
  // |should_round| is set. kTransformRowMultiplier is a Q12 constant
  // rescaled to Q31 for vqrdmulhq_n_s32.
  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
  // Taller transforms use one extra bit of shift.
  const int shift = tx_height < 16 ? 0 : 1;
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
  const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
  // vqshlq_s32 shifts right (saturating) for a negative shift count.
  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
  // Saturate to 16 bits and store only the first lane.
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
  return true;
}
1516 
// Applies the identity column transform to |source| and adds the result to
// the frame pixels, clamping to the 10-bit maximum. |identity_size| selects
// the scale: 4 and 16 multiply by their Q12 constants and shift by 4 + 12,
// while 8 simply doubles the input and shifts by 4. The tx_width == 4 branch
// processes two rows per iteration; wider blocks process 8 pixels at a time
// within each row.
template <int identity_size>
LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int32_t* source) {
  static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
                "Invalid identity_size.");
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;
  // Combined rounding bias for the 12-bit multiplier and the 4-bit shift.
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);

  if (tx_width == 4) {
    int i = 0;
    do {
      int32x4x2_t v_src, v_dst_i, a, b;
      v_src.val[0] = vld1q_s32(&source[i * 4]);
      v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
      if (identity_size == 4) {
        v_dst_i.val[0] =
            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
        v_dst_i.val[1] =
            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
      } else if (identity_size == 8) {
        v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
        v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
        a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
        a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
      } else {  // identity_size == 16
        v_dst_i.val[0] =
            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
        v_dst_i.val[1] =
            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
      }
      // Add the residual to two rows of frame pixels (10-bit values fit in
      // int16), then clamp to [0, (1 << kBitdepth10) - 1].
      uint16x4x2_t frame_data;
      frame_data.val[0] = vld1_u16(dst);
      frame_data.val[1] = vld1_u16(dst + stride);
      b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
      b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
      vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
      vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
      dst += stride << 1;
      i += 2;
    } while (i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        int32x4x2_t v_src, v_dst_i, a, b;
        v_src.val[0] = vld1q_s32(&source[row + j]);
        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
        if (identity_size == 4) {
          v_dst_i.val[0] =
              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
          v_dst_i.val[1] =
              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
        } else if (identity_size == 8) {
          v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
          v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
          a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
          a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
        } else {  // identity_size == 16
          v_dst_i.val[0] =
              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
          v_dst_i.val[1] =
              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
        }
        // Add the residual to 8 frame pixels of the current row and clamp.
        uint16x4x2_t frame_data;
        frame_data.val[0] = vld1_u16(dst + j);
        frame_data.val[1] = vld1_u16(dst + j + 4);
        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1606 
// Applies both the identity4 row and column transforms in one pass and adds
// the result to the 10-bit frame, clamping to [0, 1023]. Used for the 4x4
// identity-identity special case whose row pass was skipped.
LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int32_t* source) {
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;
  // Rounding bias for the 12-bit fixed-point identity multiplier shift.
  const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);

  if (tx_width == 4) {
    int i = 0;
    do {
      const int32x4_t v_src = vld1q_s32(&source[i * 4]);
      // Row pass: multiply by kIdentity4Multiplier and drop the 12 fraction
      // bits.
      const int32x4_t v_dst_row =
          vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
      // Column pass: multiply again; the 12-bit shift is folded into the
      // final rounded shift below.
      const int32x4_t v_dst_col =
          vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
      const uint16x4_t frame_data = vld1_u16(dst);
      // 4 = residual write-back shift; 12 = fixed-point fraction bits.
      const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
      const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
      // Saturate to unsigned 16 bits, then clamp to the 10-bit maximum.
      vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      // Process 8 columns per iteration.
      do {
        int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
        v_src.val[0] = vld1q_s32(&source[row + j]);
        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
        // Row rounding: multiply by kTransformRowMultiplier (12-bit
        // fixed point) and truncate.
        v_src_round.val[0] = vshrq_n_s32(
            vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
        v_src_round.val[1] = vshrq_n_s32(
            vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
        // Row pass of identity4 for this width reduces to a saturating x2.
        v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
        v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
        // Column pass: identity4 multiplier.
        v_dst_col.val[0] =
            vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
        v_dst_col.val[1] =
            vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
        uint16x4x2_t frame_data;
        frame_data.val[0] = vld1_u16(dst + j);
        frame_data.val[1] = vld1_u16(dst + j + 4);
        // Rounded shift: 4 write-back bits + 12 fraction bits.
        a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
        a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
        // Saturate and clamp to 10-bit pixel range before storing.
        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1663 
Identity8Row32_NEON(void * dest,int32_t step)1664 LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
1665   auto* const dst = static_cast<int32_t*>(dest);
1666 
1667   // When combining the identity8 multiplier with the row shift, the
1668   // calculations for tx_height equal to 32 can be simplified from
1669   // ((A * 2) + 2) >> 2) to ((A + 1) >> 1).
1670   for (int i = 0; i < 4; ++i) {
1671     const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
1672     const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
1673     const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
1674     const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
1675     vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
1676     vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
1677   }
1678 }
1679 
Identity8Row4_NEON(void * dest,int32_t step)1680 LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
1681   auto* const dst = static_cast<int32_t*>(dest);
1682 
1683   for (int i = 0; i < 4; ++i) {
1684     const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
1685     const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
1686     const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
1687     const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
1688     vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
1689     vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
1690   }
1691 }
1692 
// Fast path for blocks whose only nonzero coefficient is the DC value:
// applies the optional row rounding, the identity8 x2 multiply and the row
// shift to dst[0] alone. Returns false when more rows are present so the
// caller runs the full transform.
LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
                                           bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
  // All-ones mask selects the rounded value only when should_round is set.
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  // kTransformRowMultiplier is a 12-bit fixed-point constant; pre-shifting it
  // to Q31 makes vqrdmulh perform the rounded fixed-point multiply.
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
  // Identity8 multiplies by 2.
  const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
  // Negative shift amount => rounding right shift.
  const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
  // Saturate to the int16 range and write back the single DC lane.
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
  return true;
}
1708 
Identity16Row_NEON(void * dest,int32_t step,int shift)1709 LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
1710                                               int shift) {
1711   auto* const dst = static_cast<int32_t*>(dest);
1712   const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
1713   const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
1714 
1715   for (int i = 0; i < 4; ++i) {
1716     for (int j = 0; j < 2; ++j) {
1717       int32x4x2_t v_src;
1718       v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
1719       v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
1720       const int32x4_t v_src_mult_lo =
1721           vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
1722       const int32x4_t v_src_mult_hi =
1723           vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
1724       const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
1725       const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
1726       vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
1727       vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
1728     }
1729   }
1730 }
1731 
// DC-only fast path for identity16: applies the optional row rounding, the
// kIdentity16Multiplier multiply and the combined shift to dst[0] alone.
// Returns false when more rows are present so the caller runs the full
// transform.
LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
  // All-ones mask selects the rounded value only when should_round is set.
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  // Q31-scaled kTransformRowMultiplier gives a rounded fixed-point multiply.
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
  // Bias rounding both the 12-bit multiplier shift and the row shift.
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_src_mult_lo =
      vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
  // Negative shift amount => right shift.
  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
  // Saturate to the int16 range and write back the single DC lane.
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
  return true;
}
1749 
1750 //------------------------------------------------------------------------------
1751 // row/column transform loops
1752 
template <int tx_height>
// Reverses each row of tx_width coefficients in place (horizontal flip),
// used for FLIPADST column transforms. Each 4-lane reversal is built from
// vrev64q (swap within 64-bit pairs) followed by vextq by 2 (swap the pairs).
LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
  if (tx_width >= 16) {
    // Process 16 values at a time; wider rows are handled because rows are a
    // multiple of 16 and contiguous.
    int i = 0;
    do {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      const int32x4_t c = vld1q_s32(&source[i + 8]);
      const int32x4_t d = vld1q_s32(&source[i + 12]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      const int32x4_t c_rev = vrev64q_s32(c);
      const int32x4_t d_rev = vrev64q_s32(d);
      // 03 02 01 00
      // Store the four vectors back in reversed order as well.
      vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
      vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
      vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
      i += 16;
    } while (i < tx_width * tx_height);
  } else if (tx_width == 8) {
    for (int i = 0; i < 8 * tx_height; i += 8) {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      // 03 02 01 00
      // Swap the two half-row vectors while reversing each.
      vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
    }
  } else {
    // tx_width == 4: each vector is one full row, so vectors stay in place.
    // Process two rows per iteration.
    for (int i = 0; i < 4 * tx_height; i += 8) {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      // 03 02 01 00
      vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
    }
  }
}
1802 
1803 template <int tx_width>
ApplyRounding(int32_t * source,int num_rows)1804 LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
1805   // Process two rows per iteration.
1806   int i = 0;
1807   do {
1808     const int32x4_t a_lo = vld1q_s32(&source[i]);
1809     const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
1810     const int32x4_t b_lo =
1811         vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
1812     const int32x4_t b_hi =
1813         vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
1814     vst1q_s32(&source[i], b_lo);
1815     vst1q_s32(&source[i + 4], b_hi);
1816     i += 8;
1817   } while (i < tx_width * num_rows);
1818 }
1819 
1820 template <int tx_width>
RowShift(int32_t * source,int num_rows,int row_shift)1821 LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
1822                                     int row_shift) {
1823   // vqrshlq_s32 will shift right if shift value is negative.
1824   row_shift = -row_shift;
1825 
1826   // Process two rows per iteration.
1827   int i = 0;
1828   do {
1829     const int32x4_t residual0 = vld1q_s32(&source[i]);
1830     const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
1831     vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
1832     vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
1833     i += 8;
1834   } while (i < tx_width * num_rows);
1835 }
1836 
template <int tx_height, bool enable_flip_rows = false>
// Adds the residual (rounded right-shifted by 4) to the 10-bit frame and
// clamps to [0, 1023]. When enable_flip_rows is set and tx_type is in
// kTransformFlipRowsMask, rows are written in bottom-up order (FLIPADST).
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int32_t* source, TransformType tx_type) {
  const bool flip_rows =
      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;

  if (tx_width == 4) {
    for (int i = 0; i < tx_height; ++i) {
      // Read rows in reverse order when flipping.
      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
      const int32x4_t residual = vld1q_s32(&source[row]);
      const uint16x4_t frame_data = vld1_u16(dst);
      const int32x4_t a = vrshrq_n_s32(residual, 4);
      // Reinterpret keeps the bit pattern; the later saturating narrow
      // clamps any negative sums to zero.
      const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
      const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
      // Clamp to the 10-bit maximum pixel value.
      vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
      dst += stride;
    }
  } else {
    // Wider blocks: process 8 pixels per pass.
    for (int i = 0; i < tx_height; ++i) {
      const int y = start_y + i;
      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
      int j = 0;
      do {
        const int x = start_x + j;
        const int32x4_t residual = vld1q_s32(&source[row + j]);
        const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
        const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
        // Rounded residual write-back shift.
        const int32x4_t a = vrshrq_n_s32(residual, 4);
        const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
        const uint32x4_t b =
            vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
        const uint32x4_t b_hi =
            vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
        // Saturating narrow clamps negatives to zero; vminq clamps to the
        // 10-bit maximum.
        const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
        const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
        vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
                                          vdupq_n_u16((1 << kBitdepth10) - 1)));
        j += 8;
      } while (j < tx_width);
    }
  }
}
1882 
Dct4TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1883 void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
1884                                int adjusted_tx_height, void* src_buffer,
1885                                int /*start_x*/, int /*start_y*/,
1886                                void* /*dst_frame*/) {
1887   auto* src = static_cast<int32_t*>(src_buffer);
1888   const int tx_height = kTransformHeight[tx_size];
1889   const bool should_round = (tx_height == 8);
1890   const int row_shift = (tx_height == 16);
1891 
1892   if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
1893     return;
1894   }
1895 
1896   if (should_round) {
1897     ApplyRounding<4>(src, adjusted_tx_height);
1898   }
1899 
1900   // Process 4 1d dct4 rows in parallel per iteration.
1901   int i = adjusted_tx_height;
1902   auto* data = src;
1903   do {
1904     Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
1905                                    row_shift);
1906     data += 16;
1907     i -= 4;
1908   } while (i != 0);
1909 }
1910 
Dct4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)1911 void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
1912                                   int adjusted_tx_height, void* src_buffer,
1913                                   int start_x, int start_y, void* dst_frame) {
1914   auto* src = static_cast<int32_t*>(src_buffer);
1915   const int tx_width = kTransformWidth[tx_size];
1916 
1917   if (kTransformFlipColumnsMask.Contains(tx_type)) {
1918     FlipColumns<4>(src, tx_width);
1919   }
1920 
1921   if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
1922     // Process 4 1d dct4 columns in parallel per iteration.
1923     int i = tx_width;
1924     auto* data = src;
1925     do {
1926       Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
1927                                      /*row_shift=*/0);
1928       data += 4;
1929       i -= 4;
1930     } while (i != 0);
1931   }
1932 
1933   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
1934   StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
1935 }
1936 
Dct8TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1937 void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
1938                                int adjusted_tx_height, void* src_buffer,
1939                                int /*start_x*/, int /*start_y*/,
1940                                void* /*dst_frame*/) {
1941   auto* src = static_cast<int32_t*>(src_buffer);
1942   const bool should_round = kShouldRound[tx_size];
1943   const uint8_t row_shift = kTransformRowShift[tx_size];
1944 
1945   if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
1946     return;
1947   }
1948 
1949   if (should_round) {
1950     ApplyRounding<8>(src, adjusted_tx_height);
1951   }
1952 
1953   // Process 4 1d dct8 rows in parallel per iteration.
1954   int i = adjusted_tx_height;
1955   auto* data = src;
1956   do {
1957     Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
1958                                    row_shift);
1959     data += 32;
1960     i -= 4;
1961   } while (i != 0);
1962 }
1963 
Dct8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)1964 void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
1965                                   int adjusted_tx_height, void* src_buffer,
1966                                   int start_x, int start_y, void* dst_frame) {
1967   auto* src = static_cast<int32_t*>(src_buffer);
1968   const int tx_width = kTransformWidth[tx_size];
1969 
1970   if (kTransformFlipColumnsMask.Contains(tx_type)) {
1971     FlipColumns<8>(src, tx_width);
1972   }
1973 
1974   if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
1975     // Process 4 1d dct8 columns in parallel per iteration.
1976     int i = tx_width;
1977     auto* data = src;
1978     do {
1979       Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
1980                                      /*row_shift=*/0);
1981       data += 4;
1982       i -= 4;
1983     } while (i != 0);
1984   }
1985   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
1986   StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
1987 }
1988 
Dct16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1989 void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
1990                                 TransformSize tx_size, int adjusted_tx_height,
1991                                 void* src_buffer, int /*start_x*/,
1992                                 int /*start_y*/, void* /*dst_frame*/) {
1993   auto* src = static_cast<int32_t*>(src_buffer);
1994   const bool should_round = kShouldRound[tx_size];
1995   const uint8_t row_shift = kTransformRowShift[tx_size];
1996 
1997   if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
1998     return;
1999   }
2000 
2001   if (should_round) {
2002     ApplyRounding<16>(src, adjusted_tx_height);
2003   }
2004 
2005   assert(adjusted_tx_height % 4 == 0);
2006   int i = adjusted_tx_height;
2007   auto* data = src;
2008   do {
2009     // Process 4 1d dct16 rows in parallel per iteration.
2010     Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
2011     data += 64;
2012     i -= 4;
2013   } while (i != 0);
2014 }
2015 
Dct16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2016 void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2017                                    int adjusted_tx_height, void* src_buffer,
2018                                    int start_x, int start_y, void* dst_frame) {
2019   auto* src = static_cast<int32_t*>(src_buffer);
2020   const int tx_width = kTransformWidth[tx_size];
2021 
2022   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2023     FlipColumns<16>(src, tx_width);
2024   }
2025 
2026   if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
2027     // Process 4 1d dct16 columns in parallel per iteration.
2028     int i = tx_width;
2029     auto* data = src;
2030     do {
2031       Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
2032                                       /*row_shift=*/0);
2033       data += 4;
2034       i -= 4;
2035     } while (i != 0);
2036   }
2037   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2038   StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
2039 }
2040 
Dct32TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2041 void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
2042                                 TransformSize tx_size, int adjusted_tx_height,
2043                                 void* src_buffer, int /*start_x*/,
2044                                 int /*start_y*/, void* /*dst_frame*/) {
2045   auto* src = static_cast<int32_t*>(src_buffer);
2046   const bool should_round = kShouldRound[tx_size];
2047   const uint8_t row_shift = kTransformRowShift[tx_size];
2048 
2049   if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
2050     return;
2051   }
2052 
2053   if (should_round) {
2054     ApplyRounding<32>(src, adjusted_tx_height);
2055   }
2056 
2057   assert(adjusted_tx_height % 4 == 0);
2058   int i = adjusted_tx_height;
2059   auto* data = src;
2060   do {
2061     // Process 4 1d dct32 rows in parallel per iteration.
2062     Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
2063     data += 128;
2064     i -= 4;
2065   } while (i != 0);
2066 }
2067 
Dct32TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2068 void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2069                                    int adjusted_tx_height, void* src_buffer,
2070                                    int start_x, int start_y, void* dst_frame) {
2071   auto* src = static_cast<int32_t*>(src_buffer);
2072   const int tx_width = kTransformWidth[tx_size];
2073 
2074   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2075     FlipColumns<32>(src, tx_width);
2076   }
2077 
2078   if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
2079     // Process 4 1d dct32 columns in parallel per iteration.
2080     int i = tx_width;
2081     auto* data = src;
2082     do {
2083       Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2084       data += 4;
2085       i -= 4;
2086     } while (i != 0);
2087   }
2088   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2089   StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
2090 }
2091 
Dct64TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2092 void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
2093                                 TransformSize tx_size, int adjusted_tx_height,
2094                                 void* src_buffer, int /*start_x*/,
2095                                 int /*start_y*/, void* /*dst_frame*/) {
2096   auto* src = static_cast<int32_t*>(src_buffer);
2097   const bool should_round = kShouldRound[tx_size];
2098   const uint8_t row_shift = kTransformRowShift[tx_size];
2099 
2100   if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
2101     return;
2102   }
2103 
2104   if (should_round) {
2105     ApplyRounding<64>(src, adjusted_tx_height);
2106   }
2107 
2108   assert(adjusted_tx_height % 4 == 0);
2109   int i = adjusted_tx_height;
2110   auto* data = src;
2111   do {
2112     // Process 4 1d dct64 rows in parallel per iteration.
2113     Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
2114     data += 128 * 2;
2115     i -= 4;
2116   } while (i != 0);
2117 }
2118 
Dct64TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2119 void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2120                                    int adjusted_tx_height, void* src_buffer,
2121                                    int start_x, int start_y, void* dst_frame) {
2122   auto* src = static_cast<int32_t*>(src_buffer);
2123   const int tx_width = kTransformWidth[tx_size];
2124 
2125   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2126     FlipColumns<64>(src, tx_width);
2127   }
2128 
2129   if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
2130     // Process 4 1d dct64 columns in parallel per iteration.
2131     int i = tx_width;
2132     auto* data = src;
2133     do {
2134       Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2135       data += 4;
2136       i -= 4;
2137     } while (i != 0);
2138   }
2139   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2140   StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
2141 }
2142 
Adst4TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2143 void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
2144                                 TransformSize tx_size, int adjusted_tx_height,
2145                                 void* src_buffer, int /*start_x*/,
2146                                 int /*start_y*/, void* /*dst_frame*/) {
2147   auto* src = static_cast<int32_t*>(src_buffer);
2148   const int tx_height = kTransformHeight[tx_size];
2149   const int row_shift = static_cast<int>(tx_height == 16);
2150   const bool should_round = (tx_height == 8);
2151 
2152   if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2153     return;
2154   }
2155 
2156   if (should_round) {
2157     ApplyRounding<4>(src, adjusted_tx_height);
2158   }
2159 
2160   // Process 4 1d adst4 rows in parallel per iteration.
2161   int i = adjusted_tx_height;
2162   auto* data = src;
2163   do {
2164     Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
2165     data += 16;
2166     i -= 4;
2167   } while (i != 0);
2168 }
2169 
Adst4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2170 void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2171                                    int adjusted_tx_height, void* src_buffer,
2172                                    int start_x, int start_y, void* dst_frame) {
2173   auto* src = static_cast<int32_t*>(src_buffer);
2174   const int tx_width = kTransformWidth[tx_size];
2175 
2176   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2177     FlipColumns<4>(src, tx_width);
2178   }
2179 
2180   if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2181     // Process 4 1d adst4 columns in parallel per iteration.
2182     int i = tx_width;
2183     auto* data = src;
2184     do {
2185       Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2186       data += 4;
2187       i -= 4;
2188     } while (i != 0);
2189   }
2190 
2191   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2192   StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2193                                                       tx_width, src, tx_type);
2194 }
2195 
Adst8TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2196 void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
2197                                 TransformSize tx_size, int adjusted_tx_height,
2198                                 void* src_buffer, int /*start_x*/,
2199                                 int /*start_y*/, void* /*dst_frame*/) {
2200   auto* src = static_cast<int32_t*>(src_buffer);
2201   const bool should_round = kShouldRound[tx_size];
2202   const uint8_t row_shift = kTransformRowShift[tx_size];
2203 
2204   if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2205     return;
2206   }
2207 
2208   if (should_round) {
2209     ApplyRounding<8>(src, adjusted_tx_height);
2210   }
2211 
2212   // Process 4 1d adst8 rows in parallel per iteration.
2213   assert(adjusted_tx_height % 4 == 0);
2214   int i = adjusted_tx_height;
2215   auto* data = src;
2216   do {
2217     Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
2218                                     /*transpose=*/true, row_shift);
2219     data += 32;
2220     i -= 4;
2221   } while (i != 0);
2222 }
2223 
Adst8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2224 void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2225                                    int adjusted_tx_height, void* src_buffer,
2226                                    int start_x, int start_y, void* dst_frame) {
2227   auto* src = static_cast<int32_t*>(src_buffer);
2228   const int tx_width = kTransformWidth[tx_size];
2229 
2230   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2231     FlipColumns<8>(src, tx_width);
2232   }
2233 
2234   if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2235     // Process 4 1d adst8 columns in parallel per iteration.
2236     int i = tx_width;
2237     auto* data = src;
2238     do {
2239       Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
2240                                       /*row_shift=*/0);
2241       data += 4;
2242       i -= 4;
2243     } while (i != 0);
2244   }
2245   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2246   StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2247                                                       tx_width, src, tx_type);
2248 }
2249 
Adst16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2250 void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
2251                                  TransformSize tx_size, int adjusted_tx_height,
2252                                  void* src_buffer, int /*start_x*/,
2253                                  int /*start_y*/, void* /*dst_frame*/) {
2254   auto* src = static_cast<int32_t*>(src_buffer);
2255   const bool should_round = kShouldRound[tx_size];
2256   const uint8_t row_shift = kTransformRowShift[tx_size];
2257 
2258   if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2259     return;
2260   }
2261 
2262   if (should_round) {
2263     ApplyRounding<16>(src, adjusted_tx_height);
2264   }
2265 
2266   assert(adjusted_tx_height % 4 == 0);
2267   int i = adjusted_tx_height;
2268   do {
2269     // Process 4 1d adst16 rows in parallel per iteration.
2270     Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
2271     src += 64;
2272     i -= 4;
2273   } while (i != 0);
2274 }
2275 
Adst16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2276 void Adst16TransformLoopColumn_NEON(TransformType tx_type,
2277                                     TransformSize tx_size,
2278                                     int adjusted_tx_height, void* src_buffer,
2279                                     int start_x, int start_y, void* dst_frame) {
2280   auto* src = static_cast<int32_t*>(src_buffer);
2281   const int tx_width = kTransformWidth[tx_size];
2282 
2283   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2284     FlipColumns<16>(src, tx_width);
2285   }
2286 
2287   if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2288     int i = tx_width;
2289     auto* data = src;
2290     do {
2291       // Process 4 1d adst16 columns in parallel per iteration.
2292       Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
2293                                        /*row_shift=*/0);
2294       data += 4;
2295       i -= 4;
2296     } while (i != 0);
2297   }
2298   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2299   StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2300                                                        tx_width, src, tx_type);
2301 }
2302 
Identity4TransformLoopRow_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2303 void Identity4TransformLoopRow_NEON(TransformType tx_type,
2304                                     TransformSize tx_size,
2305                                     int adjusted_tx_height, void* src_buffer,
2306                                     int /*start_x*/, int /*start_y*/,
2307                                     void* /*dst_frame*/) {
2308   // Special case: Process row calculations during column transform call.
2309   // Improves performance.
2310   if (tx_type == kTransformTypeIdentityIdentity &&
2311       tx_size == kTransformSize4x4) {
2312     return;
2313   }
2314 
2315   auto* src = static_cast<int32_t*>(src_buffer);
2316   const int tx_height = kTransformHeight[tx_size];
2317   const bool should_round = (tx_height == 8);
2318 
2319   if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
2320     return;
2321   }
2322 
2323   if (should_round) {
2324     ApplyRounding<4>(src, adjusted_tx_height);
2325   }
2326 
2327   const int shift = tx_height > 8 ? 1 : 0;
2328   int i = adjusted_tx_height;
2329   do {
2330     Identity4_NEON(src, /*step=*/4, shift);
2331     src += 16;
2332     i -= 4;
2333   } while (i != 0);
2334 }
2335 
Identity4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2336 void Identity4TransformLoopColumn_NEON(TransformType tx_type,
2337                                        TransformSize tx_size,
2338                                        int adjusted_tx_height, void* src_buffer,
2339                                        int start_x, int start_y,
2340                                        void* dst_frame) {
2341   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2342   auto* src = static_cast<int32_t*>(src_buffer);
2343   const int tx_width = kTransformWidth[tx_size];
2344 
2345   // Special case: Process row calculations during column transform call.
2346   if (tx_type == kTransformTypeIdentityIdentity &&
2347       (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
2348     Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
2349                                    adjusted_tx_height, src);
2350     return;
2351   }
2352 
2353   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2354     FlipColumns<4>(src, tx_width);
2355   }
2356 
2357   IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
2358                                 adjusted_tx_height, src);
2359 }
2360 
// Row pass of the inverse identity8 transform over |src_buffer|.
void Identity8TransformLoopRow_NEON(TransformType tx_type,
                                    TransformSize tx_size,
                                    int adjusted_tx_height, void* src_buffer,
                                    int /*start_x*/, int /*start_y*/,
                                    void* /*dst_frame*/) {
  // 8x4 identity-identity is a special case: the row work is folded into the
  // column transform call, which is faster.
  if (tx_type == kTransformTypeIdentityIdentity &&
      tx_size == kTransformSize8x4) {
    return;
  }

  auto* src = static_cast<int32_t*>(src_buffer);
  const int tx_height = kTransformHeight[tx_size];
  const bool should_round = kShouldRound[tx_size];
  const uint8_t row_shift = kTransformRowShift[tx_size];

  // DC-only blocks are handled entirely by the helper.
  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
    return;
  }
  if (should_round) {
    ApplyRounding<8>(src, adjusted_tx_height);
  }

  // When the identity8 multiplier is folded into the row shift, tx_height 8
  // and 16 simplify ((A * 2) + 1) >> 1 down to plain A; for 10bpp only a
  // saturating clamp to the signed 16-bit range remains. The mask 0x18
  // selects exactly tx_height 8 and 16 from the legal heights.
  if ((tx_height & 0x18) != 0) {
    for (int row = 0; row < tx_height; ++row) {
      int32_t* const row_ptr = &src[row * 8];
      const int32x4_t lo = vld1q_s32(row_ptr);
      const int32x4_t hi = vld1q_s32(row_ptr + 4);
      // Saturating narrow to int16 then widen back, clamping in place.
      vst1q_s32(row_ptr, vmovl_s16(vqmovn_s32(lo)));
      vst1q_s32(row_ptr + 4, vmovl_s16(vqmovn_s32(hi)));
    }
    return;
  }
  if (tx_height == 32) {
    for (int remaining = adjusted_tx_height; remaining > 0; remaining -= 4) {
      Identity8Row32_NEON(src, /*step=*/8);
      src += 32;
    }
    return;
  }

  // The only remaining size is 8x4.
  assert(tx_size == kTransformSize8x4);
  for (int remaining = adjusted_tx_height; remaining > 0; remaining -= 4) {
    Identity8Row4_NEON(src, /*step=*/8);
    src += 32;
  }
}
2416 
Identity8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2417 void Identity8TransformLoopColumn_NEON(TransformType tx_type,
2418                                        TransformSize tx_size,
2419                                        int adjusted_tx_height, void* src_buffer,
2420                                        int start_x, int start_y,
2421                                        void* dst_frame) {
2422   auto* src = static_cast<int32_t*>(src_buffer);
2423   const int tx_width = kTransformWidth[tx_size];
2424 
2425   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2426     FlipColumns<8>(src, tx_width);
2427   }
2428   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2429   IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
2430                                 adjusted_tx_height, src);
2431 }
2432 
Identity16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2433 void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
2434                                      TransformSize tx_size,
2435                                      int adjusted_tx_height, void* src_buffer,
2436                                      int /*start_x*/, int /*start_y*/,
2437                                      void* /*dst_frame*/) {
2438   auto* src = static_cast<int32_t*>(src_buffer);
2439   const bool should_round = kShouldRound[tx_size];
2440   const uint8_t row_shift = kTransformRowShift[tx_size];
2441 
2442   if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2443     return;
2444   }
2445 
2446   if (should_round) {
2447     ApplyRounding<16>(src, adjusted_tx_height);
2448   }
2449   int i = adjusted_tx_height;
2450   do {
2451     Identity16Row_NEON(src, /*step=*/16, row_shift);
2452     src += 64;
2453     i -= 4;
2454   } while (i != 0);
2455 }
2456 
Identity16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2457 void Identity16TransformLoopColumn_NEON(TransformType tx_type,
2458                                         TransformSize tx_size,
2459                                         int adjusted_tx_height,
2460                                         void* src_buffer, int start_x,
2461                                         int start_y, void* dst_frame) {
2462   auto* src = static_cast<int32_t*>(src_buffer);
2463   const int tx_width = kTransformWidth[tx_size];
2464 
2465   if (kTransformFlipColumnsMask.Contains(tx_type)) {
2466     FlipColumns<16>(src, tx_width);
2467   }
2468   auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2469   IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
2470                                  adjusted_tx_height, src);
2471 }
2472 
2473 //------------------------------------------------------------------------------
2474 
Init10bpp()2475 void Init10bpp() {
2476   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
2477   assert(dsp != nullptr);
2478   // Maximum transform size for Dct is 64.
2479   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
2480       Dct4TransformLoopRow_NEON;
2481   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
2482       Dct4TransformLoopColumn_NEON;
2483   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
2484       Dct8TransformLoopRow_NEON;
2485   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
2486       Dct8TransformLoopColumn_NEON;
2487   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
2488       Dct16TransformLoopRow_NEON;
2489   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
2490       Dct16TransformLoopColumn_NEON;
2491   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
2492       Dct32TransformLoopRow_NEON;
2493   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
2494       Dct32TransformLoopColumn_NEON;
2495   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
2496       Dct64TransformLoopRow_NEON;
2497   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
2498       Dct64TransformLoopColumn_NEON;
2499 
2500   // Maximum transform size for Adst is 16.
2501   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
2502       Adst4TransformLoopRow_NEON;
2503   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
2504       Adst4TransformLoopColumn_NEON;
2505   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
2506       Adst8TransformLoopRow_NEON;
2507   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
2508       Adst8TransformLoopColumn_NEON;
2509   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
2510       Adst16TransformLoopRow_NEON;
2511   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
2512       Adst16TransformLoopColumn_NEON;
2513 
2514   // Maximum transform size for Identity transform is 32.
2515   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
2516       Identity4TransformLoopRow_NEON;
2517   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
2518       Identity4TransformLoopColumn_NEON;
2519   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
2520       Identity8TransformLoopRow_NEON;
2521   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
2522       Identity8TransformLoopColumn_NEON;
2523   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
2524       Identity16TransformLoopRow_NEON;
2525   dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
2526       Identity16TransformLoopColumn_NEON;
2527 }
2528 
2529 }  // namespace
2530 
// Public entry point: installs the NEON 10bpp inverse transforms into the
// Dsp table.
void InverseTransformInit10bpp_NEON() { Init10bpp(); }
2532 
2533 }  // namespace dsp
2534 }  // namespace libgav1
2535 #else   // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
2536 namespace libgav1 {
2537 namespace dsp {
2538 
// NEON or 10bpp support is disabled at build time: nothing to install.
void InverseTransformInit10bpp_NEON() {}
2540 
2541 }  // namespace dsp
2542 }  // namespace libgav1
2543 #endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
2544