1 // Copyright 2021 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/inverse_transform.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
19
#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
33
34 namespace libgav1 {
35 namespace dsp {
36 namespace {
37
38 // Include the constants and utility functions inside the anonymous namespace.
39 #include "src/dsp/inverse_transform.inc"
40
41 //------------------------------------------------------------------------------
42
// Transposes a 4x4 tile of 32-bit coefficients held in four vector
// registers so that row i of |out| is column i of |in|. |in| and |out| may
// alias; all inputs are read before any output is written.
LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
                                        int32x4_t out[4]) {
  // vtrnq_s32 interleaves even/odd lanes of each input pair:
  //   top.val[0]: 00 10 02 12   top.val[1]: 01 11 03 13
  //   bot.val[0]: 20 30 22 32   bot.val[1]: 21 31 23 33
  const int32x4x2_t top = vtrnq_s32(in[0], in[1]);
  const int32x4x2_t bot = vtrnq_s32(in[2], in[3]);
  // Combining matching low/high halves produces the transposed rows:
  //   00 10 20 30 / 01 11 21 31 / 02 12 22 32 / 03 13 23 33
  out[0] = vcombine_s32(vget_low_s32(top.val[0]), vget_low_s32(bot.val[0]));
  out[1] = vcombine_s32(vget_low_s32(top.val[1]), vget_low_s32(bot.val[1]));
  out[2] = vcombine_s32(vget_high_s32(top.val[0]), vget_high_s32(bot.val[0]));
  out[3] = vcombine_s32(vget_high_s32(top.val[1]), vget_high_s32(bot.val[1]));
}
67
68 //------------------------------------------------------------------------------
// Stores |store_count| vectors of four 32-bit coefficients into rows of
// |dst| starting at column |idx|; consecutive rows are |stride| elements
// apart.
template <int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
                                    const int32x4_t* const s) {
  // |store_count| is a template constant, so check it at compile time
  // instead of with a debug-only runtime assert().
  static_assert(store_count % 4 == 0, "store_count must be a multiple of 4");
  // This function is always inlined with a constant trip count, so the
  // compiler unrolls the loop; the previous manual 4x unroll was redundant.
  for (int i = 0; i < store_count; ++i) {
    vst1q_s32(&dst[i * stride + idx], s[i]);
  }
}
80
// Loads |load_count| vectors of four 32-bit coefficients from rows of
// |src| starting at column |idx|; consecutive rows are |stride| elements
// apart.
template <int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
                                   int32_t idx, int32x4_t* x) {
  // |load_count| is a template constant, so check it at compile time
  // instead of with a debug-only runtime assert().
  static_assert(load_count % 4 == 0, "load_count must be a multiple of 4");
  // Always inlined with a constant trip count; the compiler unrolls this,
  // making the previous manual 4x unroll redundant.
  for (int i = 0; i < load_count; ++i) {
    x[i] = vld1q_s32(&src[i * stride + idx]);
  }
}
92
93 // Butterfly rotate 4 values.
// Applies the butterfly rotation to the pair (*a, *b):
//   x = a * cos(angle) - b * sin(angle)
//   y = a * sin(angle) + b * cos(angle)
// each rounded and scaled down by 2^12. When |flip| is set the two results
// are stored swapped.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
                                               const int angle,
                                               const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  // The max range for the input is 18 bits. The cos128/sin128 is 13 bits,
  // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32
  // bit lane.
  const int32x4_t x =
      vrshrq_n_s32(vmlsq_n_s32(vmulq_n_s32(*a, cos128), *b, sin128), 12);
  const int32x4_t y =
      vrshrq_n_s32(vmlaq_n_s32(vmulq_n_s32(*a, sin128), *b, cos128), 12);
  *a = flip ? y : x;
  *b = flip ? x : y;
}
116
// Butterfly rotation specialized for *a known to be zero, so only the *b
// products remain:
//   x = -b * sin(angle), y = b * cos(angle), both rounded down by 2^12.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
                                                         int32x4_t* b,
                                                         const int angle,
                                                         const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  // Sin128() must fit in 12 bits for the negated multiplier below.
  assert(sin128 <= 0xfff);
  const int32x4_t x = vrshrq_n_s32(vmulq_n_s32(*b, -sin128), 12);
  const int32x4_t y = vrshrq_n_s32(vmulq_n_s32(*b, cos128), 12);
  *a = flip ? y : x;
  *b = flip ? x : y;
}
136
// Butterfly rotation specialized for *b known to be zero, so only the *a
// products remain:
//   x = a * cos(angle), y = a * sin(angle), both rounded down by 2^12.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
                                                          int32x4_t* b,
                                                          const int angle,
                                                          const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t x = vrshrq_n_s32(vmulq_n_s32(*a, cos128), 12);
  const int32x4_t y = vrshrq_n_s32(vmulq_n_s32(*a, sin128), 12);
  *a = flip ? y : x;
  *b = flip ? x : y;
}
155
// In-place saturating butterfly add/subtract: (*a, *b) becomes (a + b, a - b),
// or (b - a, b + a) when |flip| is set.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip) {
  // Saturating addition is commutative, so the sum is identical either way.
  const int32x4_t sum = vqaddq_s32(*a, *b);
  const int32x4_t diff = flip ? vqsubq_s32(*b, *a) : vqsubq_s32(*a, *b);
  if (flip) {
    *a = diff;
    *b = sum;
  } else {
    *a = sum;
    *b = diff;
  }
}
169
// Saturating butterfly add/subtract (same as the two-argument overload),
// then clamps both results to [*min, *max] to keep intermediate stage
// values inside the bitdepth-dependent range.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip, const int32x4_t* min,
                                            const int32x4_t* max) {
  HadamardRotation(a, b, flip);
  *a = vmaxq_s32(vminq_s32(*a, *max), *min);
  *b = vmaxq_s32(vminq_s32(*b, *max), *min);
}
184
// Signature shared by ButterflyRotation_4 and the *IsZero variants, letting
// the DCT stage templates be parameterized on the rotation implementation.
using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
                                       bool flip);
187
188 //------------------------------------------------------------------------------
189 // Discrete Cosine Transforms (DCT).
190
// Fast path for the row transform when only the DC coefficient is present
// (|adjusted_tx_height| <= 1). Returns true when the case was handled,
// false when the caller must run the full transform. The DC value is
// optionally rounded, multiplied by cos(32), shifted down by |row_shift|,
// saturated to the signed 16-bit range and replicated across |width|
// outputs.
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                     bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  // Pre-shifting the 12-bit multiplier up by (31 - 12) turns the rounding
  // doubling multiply-high into a rounded 12-bit fixed-point multiply.
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
  const int32_t cos128 = Cos128(32);
  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
  // vqrshlq_s32 will shift right if shift value is negative.
  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
  // Clamp result to signed 16 bits.
  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
  // The previous |width == 4| special case was an exact duplicate of a
  // single loop iteration; one loop covers all widths.
  int32_t* out = dst;
  for (int i = 0; i < width; i += 4) {
    vst1q_s32(out, result);
    out += 4;
  }
  return true;
}
218
// Fast path for the column transform when only the first row of
// coefficients is present (|adjusted_tx_height| <= 1). Returns true when
// handled. Each value in the first row is multiplied by cos(32) and the
// row is then replicated down the remaining |height| - 1 rows.
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                           int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32_t cos128 = Cos128(32);

  // Calculate dc values for first row. |width| is always a positive
  // multiple of 4, so a do/while covers the width == 4 case as well.
  int i = 0;
  do {
    const int32x4_t v_src = vld1q_s32(&dst[i]);
    const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
    vst1q_s32(&dst[i], xy);
    i += 4;
  } while (i < width);

  // Copy first row to the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}
248
// Stages 12 and 17 of the inverse DCT flow graph: the 4-point core over
// s[0..3]. When |is_fast_butterfly| is set, each rotation's second input is
// known to be zero, so the cheaper specialized rotation is used. The final
// Hadamard stage clamps to [*min, *max] unless this is the last stage of
// the whole transform.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[3], false);
    HadamardRotation(&s[1], &s[2], false);
  } else {
    HadamardRotation(&s[0], &s[3], false, min, max);
    HadamardRotation(&s[1], &s[2], false, min, max);
  }
}
272
// Performs the 4-point inverse DCT on a 4x4 tile at |dest|. When |is_row|
// is true the tile is transposed in and out, and the results are rounded
// down by |row_shift| and saturated to 16 bits before storing.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // When |is_row| is true, set range to the row range, otherwise, set to the
  // column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[4], x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 4; ++i) {
      // Round, shift right by |row_shift| and narrow-saturate to 16 bits.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(s, s);
  }
  StoreDst<4>(dst, step, 0, s);
}
307
// Stages 8, 13, 18 and 22 of the inverse DCT flow graph: extends the
// 4-point core in s[0..3] to 8 points using s[4..7]. Intermediate Hadamard
// stages clamp to [*min, *max]; the final stage clamps only when this is
// not the last stage of the whole transform.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false, min, max);
  HadamardRotation(&s[6], &s[7], true, min, max);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[7], false);
    HadamardRotation(&s[1], &s[6], false);
    HadamardRotation(&s[2], &s[5], false);
    HadamardRotation(&s[3], &s[4], false);
  } else {
    HadamardRotation(&s[0], &s[7], false, min, max);
    HadamardRotation(&s[1], &s[6], false, min, max);
    HadamardRotation(&s[2], &s[5], false, min, max);
    HadamardRotation(&s[3], &s[4], false, min, max);
  }
}
342
343 // Process dct8 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Intermediate clamp range: rows use the bitdepth-dependent row range,
  // columns the 16-bit column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[8], x[8];

  if (is_row) {
    // Rows are handled four lanes at a time: transpose both 4x4 quadrants.
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 8; ++i) {
      // Round, shift right by |row_shift| and narrow-saturate to 16 bits.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(&s[0], &s[0]);
    Transpose4x4(&s[4], &s[4]);
    StoreDst<4>(dst, step, 0, &s[0]);
    StoreDst<4>(dst, step, 4, &s[4]);
  } else {
    StoreDst<8>(dst, step, 0, &s[0]);
  }
}
389
// Stages 5, 9, 14, 19, 23 and 26 of the inverse DCT flow graph: extends the
// 8-point result in s[0..7] to 16 points using s[8..15]. Intermediate
// Hadamard stages clamp to [*min, *max]; the final stage clamps only when
// this is not the last stage of the whole transform.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false, min, max);
  HadamardRotation(&s[10], &s[11], true, min, max);
  HadamardRotation(&s[12], &s[13], false, min, max);
  HadamardRotation(&s[14], &s[15], true, min, max);

  // stage 14.
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false, min, max);
  HadamardRotation(&s[9], &s[10], false, min, max);
  HadamardRotation(&s[12], &s[15], true, min, max);
  HadamardRotation(&s[13], &s[14], true, min, max);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[15], false);
    HadamardRotation(&s[1], &s[14], false);
    HadamardRotation(&s[2], &s[13], false);
    HadamardRotation(&s[3], &s[12], false);
    HadamardRotation(&s[4], &s[11], false);
    HadamardRotation(&s[5], &s[10], false);
    HadamardRotation(&s[6], &s[9], false);
    HadamardRotation(&s[7], &s[8], false);
  } else {
    HadamardRotation(&s[0], &s[15], false, min, max);
    HadamardRotation(&s[1], &s[14], false, min, max);
    HadamardRotation(&s[2], &s[13], false, min, max);
    HadamardRotation(&s[3], &s[12], false, min, max);
    HadamardRotation(&s[4], &s[11], false, min, max);
    HadamardRotation(&s[5], &s[10], false, min, max);
    HadamardRotation(&s[6], &s[9], false, min, max);
    HadamardRotation(&s[7], &s[8], false, min, max);
  }
}
449
450 // Process dct16 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Intermediate clamp range: rows use the bitdepth-dependent row range,
  // columns the 16-bit column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[16], x[16];

  if (is_row) {
    // Transpose each 4x4 quadrant as it is loaded.
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 16; ++i) {
      // Round, shift right by |row_shift| and narrow-saturate to 16 bits.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&s[idx], &s[idx]);
      Transpose4x4(&s[idx + 4], &s[idx + 4]);
      StoreDst<4>(dst, step, idx, &s[idx]);
      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &s[0]);
  }
}
509
// Stages 3, 6, 10, 15, 20, 24, 27 and 29 of the inverse DCT flow graph:
// extends the 16-point result in s[0..15] to 32 points using s[16..31].
// Intermediate Hadamard stages clamp to [*min, *max]; the final stage
// clamps only when this is not the last stage of the whole transform.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }

  // stage 6.
  HadamardRotation(&s[16], &s[17], false, min, max);
  HadamardRotation(&s[18], &s[19], true, min, max);
  HadamardRotation(&s[20], &s[21], false, min, max);
  HadamardRotation(&s[22], &s[23], true, min, max);
  HadamardRotation(&s[24], &s[25], false, min, max);
  HadamardRotation(&s[26], &s[27], true, min, max);
  HadamardRotation(&s[28], &s[29], false, min, max);
  HadamardRotation(&s[30], &s[31], true, min, max);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false, min, max);
  HadamardRotation(&s[17], &s[18], false, min, max);
  HadamardRotation(&s[20], &s[23], true, min, max);
  HadamardRotation(&s[21], &s[22], true, min, max);
  HadamardRotation(&s[24], &s[27], false, min, max);
  HadamardRotation(&s[25], &s[26], false, min, max);
  HadamardRotation(&s[28], &s[31], true, min, max);
  HadamardRotation(&s[29], &s[30], true, min, max);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false, min, max);
  HadamardRotation(&s[17], &s[22], false, min, max);
  HadamardRotation(&s[18], &s[21], false, min, max);
  HadamardRotation(&s[19], &s[20], false, min, max);
  HadamardRotation(&s[24], &s[31], true, min, max);
  HadamardRotation(&s[25], &s[30], true, min, max);
  HadamardRotation(&s[26], &s[29], true, min, max);
  HadamardRotation(&s[27], &s[28], true, min, max);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[31], false);
    HadamardRotation(&s[1], &s[30], false);
    HadamardRotation(&s[2], &s[29], false);
    HadamardRotation(&s[3], &s[28], false);
    HadamardRotation(&s[4], &s[27], false);
    HadamardRotation(&s[5], &s[26], false);
    HadamardRotation(&s[6], &s[25], false);
    HadamardRotation(&s[7], &s[24], false);
    HadamardRotation(&s[8], &s[23], false);
    HadamardRotation(&s[9], &s[22], false);
    HadamardRotation(&s[10], &s[21], false);
    HadamardRotation(&s[11], &s[20], false);
    HadamardRotation(&s[12], &s[19], false);
    HadamardRotation(&s[13], &s[18], false);
    HadamardRotation(&s[14], &s[17], false);
    HadamardRotation(&s[15], &s[16], false);
  } else {
    HadamardRotation(&s[0], &s[31], false, min, max);
    HadamardRotation(&s[1], &s[30], false, min, max);
    HadamardRotation(&s[2], &s[29], false, min, max);
    HadamardRotation(&s[3], &s[28], false, min, max);
    HadamardRotation(&s[4], &s[27], false, min, max);
    HadamardRotation(&s[5], &s[26], false, min, max);
    HadamardRotation(&s[6], &s[25], false, min, max);
    HadamardRotation(&s[7], &s[24], false, min, max);
    HadamardRotation(&s[8], &s[23], false, min, max);
    HadamardRotation(&s[9], &s[22], false, min, max);
    HadamardRotation(&s[10], &s[21], false, min, max);
    HadamardRotation(&s[11], &s[20], false, min, max);
    HadamardRotation(&s[12], &s[19], false, min, max);
    HadamardRotation(&s[13], &s[18], false, min, max);
    HadamardRotation(&s[14], &s[17], false, min, max);
    HadamardRotation(&s[15], &s[16], false, min, max);
  }
}
621
622 // Process dct32 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
                                      const bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Intermediate clamp range: rows use the bitdepth-dependent row range,
  // columns the 16-bit column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[32], x[32];

  if (is_row) {
    // Transpose each 4x4 quadrant as it is loaded.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 32; idx += 8) {
      // Transpose into a staging buffer, then round, shift right by
      // |row_shift| and narrow-saturate to 16 bits before storing.
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (int i = 0; i < 8; ++i) {
        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<32>(dst, step, 0, &s[0]);
  }
}
701
Dct64_NEON(void * dest,int32_t step,bool is_row,int row_shift)702 void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
703 auto* const dst = static_cast<int32_t*>(dest);
704 const int32_t range = is_row ? kBitdepth10 + 7 : 15;
705 const int32x4_t min = vdupq_n_s32(-(1 << range));
706 const int32x4_t max = vdupq_n_s32((1 << range) - 1);
707 int32x4_t s[64], x[32];
708
709 if (is_row) {
710 // The last 32 values of every row are always zero if the |tx_width| is
711 // 64.
712 for (int idx = 0; idx < 32; idx += 8) {
713 LoadSrc<4>(dst, step, idx, &x[idx]);
714 LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
715 Transpose4x4(&x[idx], &x[idx]);
716 Transpose4x4(&x[idx + 4], &x[idx + 4]);
717 }
718 } else {
719 // The last 32 values of every column are always zero if the |tx_height| is
720 // 64.
721 LoadSrc<32>(dst, step, 0, &x[0]);
722 }
723
724 // stage 1
725 // kBitReverseLookup
726 // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
727 s[0] = x[0];
728 s[2] = x[16];
729 s[4] = x[8];
730 s[6] = x[24];
731 s[8] = x[4];
732 s[10] = x[20];
733 s[12] = x[12];
734 s[14] = x[28];
735
736 // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
737 s[16] = x[2];
738 s[18] = x[18];
739 s[20] = x[10];
740 s[22] = x[26];
741 s[24] = x[6];
742 s[26] = x[22];
743 s[28] = x[14];
744 s[30] = x[30];
745
746 // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
747 s[32] = x[1];
748 s[34] = x[17];
749 s[36] = x[9];
750 s[38] = x[25];
751 s[40] = x[5];
752 s[42] = x[21];
753 s[44] = x[13];
754 s[46] = x[29];
755
756 // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
757 s[48] = x[3];
758 s[50] = x[19];
759 s[52] = x[11];
760 s[54] = x[27];
761 s[56] = x[7];
762 s[58] = x[23];
763 s[60] = x[15];
764 s[62] = x[31];
765
766 Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
767 s, &min, &max, /*is_last_stage=*/false);
768 Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
769 s, &min, &max, /*is_last_stage=*/false);
770 Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
771 s, &min, &max, /*is_last_stage=*/false);
772 Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
773 s, &min, &max, /*is_last_stage=*/false);
774
775 //-- start dct 64 stages
776 // stage 2.
777 ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
778 ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
779 ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
780 ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
781 ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
782 ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
783 ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
784 ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
785 ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
786 ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
787 ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
788 ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
789 ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
790 ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
791 ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
792 ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
793
794 // stage 4.
795 HadamardRotation(&s[32], &s[33], false, &min, &max);
796 HadamardRotation(&s[34], &s[35], true, &min, &max);
797 HadamardRotation(&s[36], &s[37], false, &min, &max);
798 HadamardRotation(&s[38], &s[39], true, &min, &max);
799 HadamardRotation(&s[40], &s[41], false, &min, &max);
800 HadamardRotation(&s[42], &s[43], true, &min, &max);
801 HadamardRotation(&s[44], &s[45], false, &min, &max);
802 HadamardRotation(&s[46], &s[47], true, &min, &max);
803 HadamardRotation(&s[48], &s[49], false, &min, &max);
804 HadamardRotation(&s[50], &s[51], true, &min, &max);
805 HadamardRotation(&s[52], &s[53], false, &min, &max);
806 HadamardRotation(&s[54], &s[55], true, &min, &max);
807 HadamardRotation(&s[56], &s[57], false, &min, &max);
808 HadamardRotation(&s[58], &s[59], true, &min, &max);
809 HadamardRotation(&s[60], &s[61], false, &min, &max);
810 HadamardRotation(&s[62], &s[63], true, &min, &max);
811
812 // stage 7.
813 ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
814 ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
815 ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
816 ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
817 ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
818 ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
819 ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
820 ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
821
822 // stage 11.
823 HadamardRotation(&s[32], &s[35], false, &min, &max);
824 HadamardRotation(&s[33], &s[34], false, &min, &max);
825 HadamardRotation(&s[36], &s[39], true, &min, &max);
826 HadamardRotation(&s[37], &s[38], true, &min, &max);
827 HadamardRotation(&s[40], &s[43], false, &min, &max);
828 HadamardRotation(&s[41], &s[42], false, &min, &max);
829 HadamardRotation(&s[44], &s[47], true, &min, &max);
830 HadamardRotation(&s[45], &s[46], true, &min, &max);
831 HadamardRotation(&s[48], &s[51], false, &min, &max);
832 HadamardRotation(&s[49], &s[50], false, &min, &max);
833 HadamardRotation(&s[52], &s[55], true, &min, &max);
834 HadamardRotation(&s[53], &s[54], true, &min, &max);
835 HadamardRotation(&s[56], &s[59], false, &min, &max);
836 HadamardRotation(&s[57], &s[58], false, &min, &max);
837 HadamardRotation(&s[60], &s[63], true, &min, &max);
838 HadamardRotation(&s[61], &s[62], true, &min, &max);
839
840 // stage 16.
841 ButterflyRotation_4(&s[61], &s[34], 56, true);
842 ButterflyRotation_4(&s[60], &s[35], 56, true);
843 ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
844 ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
845 ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
846 ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
847 ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
848 ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
849
850 // stage 21.
851 HadamardRotation(&s[32], &s[39], false, &min, &max);
852 HadamardRotation(&s[33], &s[38], false, &min, &max);
853 HadamardRotation(&s[34], &s[37], false, &min, &max);
854 HadamardRotation(&s[35], &s[36], false, &min, &max);
855 HadamardRotation(&s[40], &s[47], true, &min, &max);
856 HadamardRotation(&s[41], &s[46], true, &min, &max);
857 HadamardRotation(&s[42], &s[45], true, &min, &max);
858 HadamardRotation(&s[43], &s[44], true, &min, &max);
859 HadamardRotation(&s[48], &s[55], false, &min, &max);
860 HadamardRotation(&s[49], &s[54], false, &min, &max);
861 HadamardRotation(&s[50], &s[53], false, &min, &max);
862 HadamardRotation(&s[51], &s[52], false, &min, &max);
863 HadamardRotation(&s[56], &s[63], true, &min, &max);
864 HadamardRotation(&s[57], &s[62], true, &min, &max);
865 HadamardRotation(&s[58], &s[61], true, &min, &max);
866 HadamardRotation(&s[59], &s[60], true, &min, &max);
867
868 // stage 25.
869 ButterflyRotation_4(&s[59], &s[36], 48, true);
870 ButterflyRotation_4(&s[58], &s[37], 48, true);
871 ButterflyRotation_4(&s[57], &s[38], 48, true);
872 ButterflyRotation_4(&s[56], &s[39], 48, true);
873 ButterflyRotation_4(&s[55], &s[40], 112, true);
874 ButterflyRotation_4(&s[54], &s[41], 112, true);
875 ButterflyRotation_4(&s[53], &s[42], 112, true);
876 ButterflyRotation_4(&s[52], &s[43], 112, true);
877
878 // stage 28.
879 HadamardRotation(&s[32], &s[47], false, &min, &max);
880 HadamardRotation(&s[33], &s[46], false, &min, &max);
881 HadamardRotation(&s[34], &s[45], false, &min, &max);
882 HadamardRotation(&s[35], &s[44], false, &min, &max);
883 HadamardRotation(&s[36], &s[43], false, &min, &max);
884 HadamardRotation(&s[37], &s[42], false, &min, &max);
885 HadamardRotation(&s[38], &s[41], false, &min, &max);
886 HadamardRotation(&s[39], &s[40], false, &min, &max);
887 HadamardRotation(&s[48], &s[63], true, &min, &max);
888 HadamardRotation(&s[49], &s[62], true, &min, &max);
889 HadamardRotation(&s[50], &s[61], true, &min, &max);
890 HadamardRotation(&s[51], &s[60], true, &min, &max);
891 HadamardRotation(&s[52], &s[59], true, &min, &max);
892 HadamardRotation(&s[53], &s[58], true, &min, &max);
893 HadamardRotation(&s[54], &s[57], true, &min, &max);
894 HadamardRotation(&s[55], &s[56], true, &min, &max);
895
896 // stage 30.
897 ButterflyRotation_4(&s[55], &s[40], 32, true);
898 ButterflyRotation_4(&s[54], &s[41], 32, true);
899 ButterflyRotation_4(&s[53], &s[42], 32, true);
900 ButterflyRotation_4(&s[52], &s[43], 32, true);
901 ButterflyRotation_4(&s[51], &s[44], 32, true);
902 ButterflyRotation_4(&s[50], &s[45], 32, true);
903 ButterflyRotation_4(&s[49], &s[46], 32, true);
904 ButterflyRotation_4(&s[48], &s[47], 32, true);
905
906 // stage 31.
907 for (int i = 0; i < 32; i += 4) {
908 HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
909 HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
910 HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
911 HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
912 }
913 //-- end dct 64 stages
914 if (is_row) {
915 const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
916 for (int idx = 0; idx < 64; idx += 8) {
917 int32x4_t output[8];
918 Transpose4x4(&s[idx], &output[0]);
919 Transpose4x4(&s[idx + 4], &output[4]);
920 for (int i = 0; i < 8; ++i) {
921 output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
922 }
923 StoreDst<4>(dst, step, idx, &output[0]);
924 StoreDst<4>(dst, step, idx + 4, &output[4]);
925 }
926 } else {
927 StoreDst<64>(dst, step, 0, &s[0]);
928 }
929 }
930
931 //------------------------------------------------------------------------------
932 // Asymmetric Discrete Sine Transforms (ADST).
// 4-point inverse ADST (10-bit path). Transforms one 4x4 tile of |dest| in
// place. For rows the tile is transposed before and after the transform and
// the output is narrowed by |row_shift| with saturation to the int16 range;
// columns are transformed as-is.
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];
  int32x4_t x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);

  // stage 2.
  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
  const int32x4_t b7 = vaddq_s32(a7, x[3]);

  // stage 3.
  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
  // s[0] = s[0] + s[3]
  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
  // s[1] = s[1] - s[4]
  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);

  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);

  // stage 4.
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[6]);

  // stages 5 and 6.
  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
  // Descale the 12-bit fixed-point multipliers with rounding.
  x[0] = vrshrq_n_s32(x0, 12);
  x[1] = vrshrq_n_s32(x1, 12);
  x[2] = vrshrq_n_s32(s[2], 12);
  x[3] = vrshrq_n_s32(x3, 12);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for negative shift values;
    // the qmovn/movl pair saturates the row output to the int16 range.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
    x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
    x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
    x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
    Transpose4x4(x, x);
  }
  StoreDst<4>(dst, step, 0, x);
}
987
// ADST4 multipliers pre-arranged so that all four output lanes of the
// single-coefficient (DC-only) path can be produced with one vector multiply
// in Adst4DcOnly().
alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
                                                           2482};
990
// Fast path for an ADST4 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);

  // Select between the row-rounded and the raw DC value with a mask so the
  // code stays branch-free.
  const int32x4_t v_dc = vdupq_n_s32(dst[0]);
  const uint32x4_t v_round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_dc_round =
      vqrdmulhq_n_s32(v_dc, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_round_mask, v_dc_round, v_dc);

  // s0*k0 s0*k1 s0*k2 s0*k1
  const int32x4_t products =
      vmulq_s32(vld1q_s32(kAdst4DcOnlyMultiplier), v_src);
  // 0 0 0 s0*k0
  const int32x4_t shifted = vextq_s32(vdupq_n_s32(0), products, 1);

  const int32x4_t sum = vaddq_s32(products, shifted);
  const int32x4_t result = vrshrq_n_s32(sum, 12);

  // vqrshlq_s32 will shift right if shift value is negative.
  vst1q_s32(
      dst, vmovl_s16(vqmovn_s32(vqrshlq_s32(result, vdupq_n_s32(-row_shift)))));

  return true;
}
1021
// Fast path for an ADST4 column transform when only the first row of
// coefficients is non-zero. Handles |width| columns, four per iteration.
LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);

  int col = 0;
  do {
    const int32x4_t v_src = vld1q_s32(&dst[col]);

    const int32x4_t p0 = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
    const int32x4_t p1 = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
    const int32x4_t p2 = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
    // Output row 3 is the sum of the row 0 and row 1 products.
    const int32x4_t p3 = vaddq_s32(p0, p1);

    // Descale the 12-bit fixed-point products with rounding.
    vst1q_s32(&dst[col], vrshrq_n_s32(p0, 12));
    vst1q_s32(&dst[col + width * 1], vrshrq_n_s32(p1, 12));
    vst1q_s32(&dst[col + width * 2], vrshrq_n_s32(p2, 12));
    vst1q_s32(&dst[col + width * 3], vrshrq_n_s32(p3, 12));

    col += 4;
  } while (col < width);

  return true;
}
1056
// 8-point inverse ADST (10-bit path). Transforms 8 registers of 4 values in
// place at |dest|. For rows, 4x4 tiles are transposed on load/store and the
// output is narrowed by |row_shift| with saturation to the int16 range.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Intermediate clamp range: bitdepth + 7 bits for rows, 15 bits for
  // columns.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[8], x[8];

  if (is_row) {
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1.  (input permutation)
  s[0] = x[7];
  s[1] = x[0];
  s[2] = x[5];
  s[3] = x[2];
  s[4] = x[3];
  s[5] = x[4];
  s[6] = x[1];
  s[7] = x[6];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
  butterfly_rotation(&s[6], &s[7], 60 - 48, true);

  // stage 3.
  HadamardRotation(&s[0], &s[4], false, &min, &max);
  HadamardRotation(&s[1], &s[5], false, &min, &max);
  HadamardRotation(&s[2], &s[6], false, &min, &max);
  HadamardRotation(&s[3], &s[7], false, &min, &max);

  // stage 4.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[2], false, &min, &max);
  HadamardRotation(&s[4], &s[6], false, &min, &max);
  HadamardRotation(&s[1], &s[3], false, &min, &max);
  HadamardRotation(&s[5], &s[7], false, &min, &max);

  // stage 6.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);

  // stage 7.  (output permutation with sign flips)
  x[0] = s[0];
  x[1] = vqnegq_s32(s[4]);
  x[2] = s[6];
  x[3] = vqnegq_s32(s[2]);
  x[4] = s[3];
  x[5] = vqnegq_s32(s[7]);
  x[6] = s[5];
  x[7] = vqnegq_s32(s[1]);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for negative shift values.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 8; ++i) {
      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
    }
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
    StoreDst<4>(dst, step, 0, &x[0]);
    StoreDst<4>(dst, step, 4, &x[4]);
  } else {
    StoreDst<8>(dst, step, 0, &x[0]);
  }
}
1134
// Fast path for an ADST8 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
                                       bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];

  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  // Select the optionally row-rounded value with a mask to stay branch-free.
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  // stage 1.
  s[1] = vbslq_s32(v_mask, v_src_round, v_src);

  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

  // stage 3.
  s[4] = s[0];
  s[5] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[4], &s[5], 48, true);

  // stage 5.
  s[2] = s[0];
  s[3] = s[1];
  s[6] = s[4];
  s[7] = s[5];

  // stage 6.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);

  // stage 7.  (output permutation with sign flips)
  int32x4_t x[8];
  x[0] = s[0];
  x[1] = vqnegq_s32(s[4]);
  x[2] = s[6];
  x[3] = vqnegq_s32(s[2]);
  x[4] = s[3];
  x[5] = vqnegq_s32(s[7]);
  x[6] = s[5];
  x[7] = vqnegq_s32(s[1]);

  for (int i = 0; i < 8; ++i) {
    // vqrshlq_s32 will shift right if shift value is negative.
    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
    vst1q_lane_s32(&dst[i], x[i], 0);
  }

  return true;
}
1188
// Fast path for an ADST8 column transform when only the first row of
// coefficients is non-zero. Processes |width| columns, four per iteration,
// writing all 8 output rows. Returns false when the regular transform must
// run instead.
LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
                                             int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];

  int i = 0;
  do {
    const int32x4_t v_src = vld1q_s32(dst);
    // stage 1.
    s[1] = v_src;

    // stage 2.
    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);

    // stage 3.
    s[4] = s[0];
    s[5] = s[1];

    // stage 4.
    ButterflyRotation_4(&s[4], &s[5], 48, true);

    // stage 5.
    s[2] = s[0];
    s[3] = s[1];
    s[6] = s[4];
    s[7] = s[5];

    // stage 6.
    ButterflyRotation_4(&s[2], &s[3], 32, true);
    ButterflyRotation_4(&s[6], &s[7], 32, true);

    // stage 7.  (output permutation with sign flips)
    int32x4_t x[8];
    x[0] = s[0];
    x[1] = vqnegq_s32(s[4]);
    x[2] = s[6];
    x[3] = vqnegq_s32(s[2]);
    x[4] = s[3];
    x[5] = vqnegq_s32(s[7]);
    x[6] = s[5];
    x[7] = vqnegq_s32(s[1]);

    for (int j = 0; j < 8; ++j) {
      vst1q_s32(&dst[j * width], x[j]);
    }
    i += 4;
    dst += 4;
  } while (i < width);

  return true;
}
1242
// 16-point inverse ADST (10-bit path). Transforms 16 registers of 4 values
// in place at |dest|. For rows, 4x4 tiles are transposed on load/store and
// the output is narrowed by |row_shift| with saturation to the int16 range.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
                                       int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // Intermediate clamp range: bitdepth + 7 bits for rows, 15 bits for
  // columns.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[16], x[16];

  if (is_row) {
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1.  (input permutation)
  s[0] = x[15];
  s[1] = x[0];
  s[2] = x[13];
  s[3] = x[2];
  s[4] = x[11];
  s[5] = x[4];
  s[6] = x[9];
  s[7] = x[6];
  s[8] = x[7];
  s[9] = x[8];
  s[10] = x[5];
  s[11] = x[10];
  s[12] = x[3];
  s[13] = x[12];
  s[14] = x[1];
  s[15] = x[14];

  // stage 2.
  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
  butterfly_rotation(&s[14], &s[15], 62 - 56, true);

  // stage 3.
  HadamardRotation(&s[0], &s[8], false, &min, &max);
  HadamardRotation(&s[1], &s[9], false, &min, &max);
  HadamardRotation(&s[2], &s[10], false, &min, &max);
  HadamardRotation(&s[3], &s[11], false, &min, &max);
  HadamardRotation(&s[4], &s[12], false, &min, &max);
  HadamardRotation(&s[5], &s[13], false, &min, &max);
  HadamardRotation(&s[6], &s[14], false, &min, &max);
  HadamardRotation(&s[7], &s[15], false, &min, &max);

  // stage 4.
  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
  butterfly_rotation(&s[15], &s[14], 8 + 32, true);

  // stage 5.
  HadamardRotation(&s[0], &s[4], false, &min, &max);
  HadamardRotation(&s[8], &s[12], false, &min, &max);
  HadamardRotation(&s[1], &s[5], false, &min, &max);
  HadamardRotation(&s[9], &s[13], false, &min, &max);
  HadamardRotation(&s[2], &s[6], false, &min, &max);
  HadamardRotation(&s[10], &s[14], false, &min, &max);
  HadamardRotation(&s[3], &s[7], false, &min, &max);
  HadamardRotation(&s[11], &s[15], false, &min, &max);

  // stage 6.
  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
  butterfly_rotation(&s[15], &s[14], 48 - 32, true);

  // stage 7.
  HadamardRotation(&s[0], &s[2], false, &min, &max);
  HadamardRotation(&s[4], &s[6], false, &min, &max);
  HadamardRotation(&s[8], &s[10], false, &min, &max);
  HadamardRotation(&s[12], &s[14], false, &min, &max);
  HadamardRotation(&s[1], &s[3], false, &min, &max);
  HadamardRotation(&s[5], &s[7], false, &min, &max);
  HadamardRotation(&s[9], &s[11], false, &min, &max);
  HadamardRotation(&s[13], &s[15], false, &min, &max);

  // stage 8.
  butterfly_rotation(&s[2], &s[3], 32, true);
  butterfly_rotation(&s[6], &s[7], 32, true);
  butterfly_rotation(&s[10], &s[11], 32, true);
  butterfly_rotation(&s[14], &s[15], 32, true);

  // stage 9.  (output permutation with sign flips)
  x[0] = s[0];
  x[1] = vqnegq_s32(s[8]);
  x[2] = s[12];
  x[3] = vqnegq_s32(s[4]);
  x[4] = s[6];
  x[5] = vqnegq_s32(s[14]);
  x[6] = s[10];
  x[7] = vqnegq_s32(s[2]);
  x[8] = s[3];
  x[9] = vqnegq_s32(s[11]);
  x[10] = s[15];
  x[11] = vqnegq_s32(s[7]);
  x[12] = s[5];
  x[13] = vqnegq_s32(s[13]);
  x[14] = s[9];
  x[15] = vqnegq_s32(s[1]);

  if (is_row) {
    // vqrshlq_s32 shifts right (with rounding) for negative shift values.
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 16; ++i) {
      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
      StoreDst<4>(dst, step, idx, &x[idx]);
      StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &x[0]);
  }
}
1372
// Shared tail of the ADST16 single-coefficient paths. On entry |s[1]| holds
// the lone non-zero input; stages 2-9 fan it out into the 16 outputs in |x|.
// No HadamardRotation (and therefore no clamping) occurs in this reduced
// path -- only butterfly rotations and copies.
LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
  // stage 2.
  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);

  // stage 3.
  s[8] = s[0];
  s[9] = s[1];

  // stage 4.
  ButterflyRotation_4(&s[8], &s[9], 56, true);

  // stage 5.
  s[4] = s[0];
  s[12] = s[8];
  s[5] = s[1];
  s[13] = s[9];

  // stage 6.
  ButterflyRotation_4(&s[4], &s[5], 48, true);
  ButterflyRotation_4(&s[12], &s[13], 48, true);

  // stage 7.
  s[2] = s[0];
  s[6] = s[4];
  s[10] = s[8];
  s[14] = s[12];
  s[3] = s[1];
  s[7] = s[5];
  s[11] = s[9];
  s[15] = s[13];

  // stage 8.
  ButterflyRotation_4(&s[2], &s[3], 32, true);
  ButterflyRotation_4(&s[6], &s[7], 32, true);
  ButterflyRotation_4(&s[10], &s[11], 32, true);
  ButterflyRotation_4(&s[14], &s[15], 32, true);

  // stage 9.  (output permutation with sign flips)
  x[0] = s[0];
  x[1] = vqnegq_s32(s[8]);
  x[2] = s[12];
  x[3] = vqnegq_s32(s[4]);
  x[4] = s[6];
  x[5] = vqnegq_s32(s[14]);
  x[6] = s[10];
  x[7] = vqnegq_s32(s[2]);
  x[8] = s[3];
  x[9] = vqnegq_s32(s[11]);
  x[10] = s[15];
  x[11] = vqnegq_s32(s[7]);
  x[12] = s[5];
  x[13] = vqnegq_s32(s[13]);
  x[14] = s[9];
  x[15] = vqnegq_s32(s[1]);
}
1428
// Fast path for an ADST16 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int32x4_t s[16];
  int32x4_t x[16];

  // Select between the row-rounded and the raw DC value with a mask so the
  // code stays branch-free.
  const int32x4_t v_dc = vdupq_n_s32(dst[0]);
  const uint32x4_t v_round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_dc_round =
      vqrdmulhq_n_s32(v_dc, kTransformRowMultiplier << (31 - 12));
  // stage 1.
  s[1] = vbslq_s32(v_round_mask, v_dc_round, v_dc);

  Adst16DcOnlyInternal(s, x);

  // vqrshlq_s32 shifts right (with rounding) for negative shift values.
  const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
  for (int i = 0; i < 16; ++i) {
    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
    vst1q_lane_s32(&dst[i], x[i], 0);
  }

  return true;
}
1453
// Fast path for an ADST16 column transform when only the first row of
// coefficients is non-zero. Handles |width| columns, four per iteration,
// writing all 16 output rows.
LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
                                              int adjusted_tx_height,
                                              int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  int col = 0;
  do {
    int32x4_t s[16];
    int32x4_t x[16];
    // stage 1.
    s[1] = vld1q_s32(dst);

    Adst16DcOnlyInternal(s, x);

    for (int row = 0; row < 16; ++row) {
      vst1q_s32(&dst[row * width], x[row]);
    }
    col += 4;
    dst += 4;
  } while (col < width);

  return true;
}
1479
1480 //------------------------------------------------------------------------------
1481 // Identity Transforms.
1482
// Identity4 row transform: each value is multiplied by kIdentity4Multiplier
// in 12-bit fixed point, descaled by (12 + shift) with dual rounding, and
// saturated to the int16 range.
LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
  for (int row = 0; row < 4; ++row) {
    int32_t* const row_dst = &dst[row * step];
    const int32x4_t v_src = vld1q_s32(row_dst);
    const int32x4_t v_scaled = vmlaq_s32(v_dual_round, v_src, v_multiplier);
    const int32x4_t v_result = vqshlq_s32(v_scaled, v_shift);
    vst1q_s32(row_dst, vmovl_s16(vqmovn_s32(v_result)));
  }
}
1496
// Fast path for an identity4 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
                                           bool should_round, int tx_height) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  // Select between the row-rounded and the raw DC value with a mask so the
  // code stays branch-free.
  const int32x4_t v_dc = vdupq_n_s32(dst[0]);
  const uint32x4_t v_round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_dc_round =
      vqrdmulhq_n_s32(v_dc, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_round_mask, v_dc_round, v_dc);

  // Transforms with height >= 16 take one extra bit of row shift.
  const int shift = tx_height < 16 ? 0 : 1;
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_scaled =
      vmlaq_s32(v_dual_round, v_src, vdupq_n_s32(kIdentity4Multiplier));
  const int32x4_t v_result = vqshlq_s32(v_scaled, vdupq_n_s32(-(12 + shift)));
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(v_result)), 0);
  return true;
}
1516
// Scales a pair of 4-lane vectors of column-transform input by the identity
// multiplier for |identity_size| and applies the column shift of 4 (plus the
// 12-bit multiplier descale where a fixed-point multiplier is used). Shared
// by both paths of IdentityColumnStoreToFrame() to avoid duplicating the
// per-size arithmetic.
template <int identity_size>
LIBGAV1_ALWAYS_INLINE void IdentityColumnScale(const int32x4_t v_dual_round,
                                               const int32x4x2_t v_src,
                                               int32x4x2_t* a) {
  if (identity_size == 4) {
    const int32x4_t v0 =
        vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
    const int32x4_t v1 =
        vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
    a->val[0] = vshrq_n_s32(v0, 4 + 12);
    a->val[1] = vshrq_n_s32(v1, 4 + 12);
  } else if (identity_size == 8) {
    // Identity8 doubles the input; no fixed-point multiplier is needed.
    a->val[0] = vrshrq_n_s32(vaddq_s32(v_src.val[0], v_src.val[0]), 4);
    a->val[1] = vrshrq_n_s32(vaddq_s32(v_src.val[1], v_src.val[1]), 4);
  } else {  // identity_size == 16
    const int32x4_t v0 =
        vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
    const int32x4_t v1 =
        vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
    a->val[0] = vshrq_n_s32(v0, 4 + 12);
    a->val[1] = vshrq_n_s32(v1, 4 + 12);
  }
}

// Applies the identity column transform of |identity_size| to |source| and
// adds the residual to the 10-bit pixels of |frame| at (start_x, start_y),
// clamping the sums to the valid pixel range.
template <int identity_size>
LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int32_t* source) {
  static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
                "Invalid identity_size.");
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);

  if (tx_width == 4) {
    // Two rows of 4 values per iteration.
    int i = 0;
    do {
      int32x4x2_t v_src, a, b;
      v_src.val[0] = vld1q_s32(&source[i * 4]);
      v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
      IdentityColumnScale<identity_size>(v_dual_round, v_src, &a);
      uint16x4x2_t frame_data;
      frame_data.val[0] = vld1_u16(dst);
      frame_data.val[1] = vld1_u16(dst + stride);
      // Add the residual to the frame and clamp to the 10-bit pixel range.
      b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
      b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
      vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
      vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
      dst += stride << 1;
      i += 2;
    } while (i < tx_height);
  } else {
    // One row per iteration, 8 columns at a time.
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        int32x4x2_t v_src, a, b;
        v_src.val[0] = vld1q_s32(&source[row + j]);
        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
        IdentityColumnScale<identity_size>(v_dual_round, v_src, &a);
        uint16x4x2_t frame_data;
        frame_data.val[0] = vld1_u16(dst + j);
        frame_data.val[1] = vld1_u16(dst + j + 4);
        // Add the residual to the frame and clamp to the 10-bit pixel range.
        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1606
// Applies the remaining identity scaling to |source| and adds the residual
// to the 10-bit pixels of |frame| at (start_x, start_y), clamping the sums
// to the valid pixel range. The two branches apply different row scaling:
// the 4-wide path multiplies twice by kIdentity4Multiplier, the wider path
// rounds with kTransformRowMultiplier and doubles before the column multiply.
LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int tx_height, const int32_t* source) {
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;
  // Rounding bias for the 12-bit fixed-point multipliers.
  const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);

  if (tx_width == 4) {
    int i = 0;
    do {
      const int32x4_t v_src = vld1q_s32(&source[i * 4]);
      // Row pass: multiply and descale by 12 bits.
      const int32x4_t v_dst_row =
          vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
      // Column pass: multiply again; descaled with the column shift below.
      const int32x4_t v_dst_col =
          vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
      const uint16x4_t frame_data = vld1_u16(dst);
      const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
      const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
      // Clamp the sum to the valid 10-bit pixel range.
      vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
      dst += stride;
    } while (++i < tx_height);
  } else {
    int i = 0;
    do {
      const int row = i * tx_width;
      int j = 0;
      do {
        int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
        v_src.val[0] = vld1q_s32(&source[row + j]);
        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
        // Round the row input in 12-bit fixed point.
        v_src_round.val[0] = vshrq_n_s32(
            vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
        v_src_round.val[1] = vshrq_n_s32(
            vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
        // Saturating doubling for the row pass.
        v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
        v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
        // Column identity multiply.
        v_dst_col.val[0] =
            vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
        v_dst_col.val[1] =
            vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
        uint16x4x2_t frame_data;
        frame_data.val[0] = vld1_u16(dst + j);
        frame_data.val[1] = vld1_u16(dst + j + 4);
        a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
        a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
        // Add the residual to the frame and clamp to the 10-bit pixel range.
        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
        j += 8;
      } while (j < tx_width);
      dst += stride;
    } while (++i < tx_height);
  }
}
1663
// Identity8 row transform specialized for tx_height == 32.
LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
  auto* const dst = static_cast<int32_t*>(dest);

  // When combining the identity8 multiplier with the row shift, the
  // calculations for tx_height equal to 32 can be simplified from
  // ((A * 2) + 2) >> 2) to ((A + 1) >> 1).
  for (int row = 0; row < 4; ++row) {
    int32_t* const row_dst = &dst[row * step];
    const int32x4_t lo = vrshrq_n_s32(vld1q_s32(row_dst), 1);
    const int32x4_t hi = vrshrq_n_s32(vld1q_s32(row_dst + 4), 1);
    vst1q_s32(row_dst, vmovl_s16(vqmovn_s32(lo)));
    vst1q_s32(row_dst + 4, vmovl_s16(vqmovn_s32(hi)));
  }
}
1679
// Identity8 row transform specialized for tx_height == 4: a saturating
// doubling of each value, narrowed to the int16 range.
LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
  auto* const dst = static_cast<int32_t*>(dest);

  for (int row = 0; row < 4; ++row) {
    int32_t* const row_dst = &dst[row * step];
    const int32x4_t lo = vld1q_s32(row_dst);
    const int32x4_t hi = vld1q_s32(row_dst + 4);
    vst1q_s32(row_dst, vmovl_s16(vqmovn_s32(vqaddq_s32(lo, lo))));
    vst1q_s32(row_dst + 4, vmovl_s16(vqmovn_s32(vqaddq_s32(hi, hi))));
  }
}
1692
// Fast path for an identity8 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
                                           bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  // Select between the row-rounded and the raw DC value with a mask so the
  // code stays branch-free.
  const int32x4_t v_dc = vdupq_n_s32(dst[0]);
  const uint32x4_t v_round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_dc_round =
      vqrdmulhq_n_s32(v_dc, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_round_mask, v_dc_round, v_dc);
  // Identity8 doubles the input; vqrshlq_s32 shifts right (with rounding)
  // for negative shift values.
  const int32x4_t v_doubled = vaddq_s32(v_src, v_src);
  const int32x4_t v_result = vqrshlq_s32(v_doubled, vdupq_n_s32(-row_shift));
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(v_result)), 0);
  return true;
}
1708
// Identity16 row transform: each value is multiplied by
// kIdentity16Multiplier in 12-bit fixed point, descaled by (12 + shift) with
// dual rounding, and saturated to the int16 range.
LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
                                              int shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));

  for (int i = 0; i < 4; ++i) {
    int32_t* const row_dst = &dst[i * step];
    // 16 values per row, processed 4 at a time.
    for (int offset = 0; offset < 16; offset += 4) {
      const int32x4_t v_src = vld1q_s32(row_dst + offset);
      const int32x4_t v_scaled =
          vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
      const int32x4_t v_result = vqshlq_s32(v_scaled, v_shift);
      vst1q_s32(row_dst + offset, vmovl_s16(vqmovn_s32(v_result)));
    }
  }
}
1731
// Fast path for an identity16 row transform when only the DC coefficient is
// non-zero. Returns false when the regular transform must run instead.
LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  // Select between the row-rounded and the raw DC value with a mask so the
  // code stays branch-free.
  const int32x4_t v_dc = vdupq_n_s32(dst[0]);
  const uint32x4_t v_round_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_dc_round =
      vqrdmulhq_n_s32(v_dc, kTransformRowMultiplier << (31 - 12));
  const int32x4_t v_src = vbslq_s32(v_round_mask, v_dc_round, v_dc);
  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
  const int32x4_t v_scaled =
      vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
  const int32x4_t v_result = vqshlq_s32(v_scaled, vdupq_n_s32(-(12 + shift)));
  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(v_result)), 0);
  return true;
}
1749
1750 //------------------------------------------------------------------------------
1751 // row/column transform loops
1752
// Reverses the order of the |tx_width| coefficients within each row of
// |source| (horizontal mirror), used for flipped-ADST transform types.
// |tx_height| is the height of the 1D column transform.
// NOTE(review): the tx_width >= 16 branch reverses within 16-coefficient
// groups only; flip transform types appear limited to widths <= 16 so wider
// rows should not reach it — confirm against the transform type masks.
template <int tx_height>
LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
  if (tx_width >= 16) {
    int i = 0;
    do {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      const int32x4_t c = vld1q_s32(&source[i + 8]);
      const int32x4_t d = vld1q_s32(&source[i + 12]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      const int32x4_t c_rev = vrev64q_s32(c);
      const int32x4_t d_rev = vrev64q_s32(d);
      // 03 02 01 00 -- vrev64q swaps within 64-bit lanes; vextq swaps the
      // two halves, completing the 4-element reversal. Vector order is also
      // swapped (d..a) to reverse the full group of 16.
      vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
      vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
      vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
      i += 16;
    } while (i < tx_width * tx_height);
  } else if (tx_width == 8) {
    for (int i = 0; i < 8 * tx_height; i += 8) {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      // 03 02 01 00 -- store vectors in swapped order to reverse the row of 8.
      vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
    }
  } else {
    // tx_width == 4: each vector is one full row.
    // Process two rows per iteration.
    for (int i = 0; i < 4 * tx_height; i += 8) {
      // 00 01 02 03
      const int32x4_t a = vld1q_s32(&source[i]);
      const int32x4_t b = vld1q_s32(&source[i + 4]);
      // 01 00 03 02
      const int32x4_t a_rev = vrev64q_s32(a);
      const int32x4_t b_rev = vrev64q_s32(b);
      // 03 02 01 00
      vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
      vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
    }
  }
}
1802
// Scales all |tx_width| x |num_rows| coefficients by kTransformRowMultiplier
// using a rounding doubling multiply. This is the full-block counterpart of
// the rounding folded into the *DcOnly helpers.
template <int tx_width>
LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
  // Process eight coefficients (two rows when tx_width == 4) per iteration.
  int i = 0;
  do {
    const int32x4_t a_lo = vld1q_s32(&source[i]);
    const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
    // The multiplier is positioned in the top bits so vqrdmulh acts as a
    // fixed-point multiply by kTransformRowMultiplier / 4096.
    const int32x4_t b_lo =
        vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
    const int32x4_t b_hi =
        vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
    vst1q_s32(&source[i], b_lo);
    vst1q_s32(&source[i + 4], b_hi);
    i += 8;
  } while (i < tx_width * num_rows);
}
1819
1820 template <int tx_width>
RowShift(int32_t * source,int num_rows,int row_shift)1821 LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
1822 int row_shift) {
1823 // vqrshlq_s32 will shift right if shift value is negative.
1824 row_shift = -row_shift;
1825
1826 // Process two rows per iteration.
1827 int i = 0;
1828 do {
1829 const int32x4_t residual0 = vld1q_s32(&source[i]);
1830 const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
1831 vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
1832 vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
1833 i += 8;
1834 } while (i < tx_width * num_rows);
1835 }
1836
// Adds the transform residual, rounded by 4 fractional bits, to the frame
// buffer and clamps the result to the 10-bit range [0, 1023]. When
// |enable_flip_rows| is set and |tx_type| is in kTransformFlipRowsMask, the
// source rows are consumed bottom-to-top.
template <int tx_height, bool enable_flip_rows = false>
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
    Array2DView<uint16_t> frame, const int start_x, const int start_y,
    const int tx_width, const int32_t* source, TransformType tx_type) {
  const bool flip_rows =
      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
  const int stride = frame.columns();
  uint16_t* dst = frame[start_y] + start_x;

  if (tx_width == 4) {
    // One vector covers a full row.
    for (int i = 0; i < tx_height; ++i) {
      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
      const int32x4_t residual = vld1q_s32(&source[row]);
      const uint16x4_t frame_data = vld1_u16(dst);
      const int32x4_t a = vrshrq_n_s32(residual, 4);
      // Signed residual + unsigned pixel; vqmovun clamps negatives to 0.
      const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
      const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
      // Clamp to the 10bpp maximum.
      vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
      dst += stride;
    }
  } else {
    // Process eight pixels per iteration.
    for (int i = 0; i < tx_height; ++i) {
      const int y = start_y + i;
      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
      int j = 0;
      do {
        const int x = start_x + j;
        const int32x4_t residual = vld1q_s32(&source[row + j]);
        const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
        const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
        const int32x4_t a = vrshrq_n_s32(residual, 4);
        const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
        const uint32x4_t b =
            vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
        const uint32x4_t b_hi =
            vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
        const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
        const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
        vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
                                          vdupq_n_u16((1 << kBitdepth10) - 1)));
        j += 8;
      } while (j < tx_width);
    }
  }
}
1882
Dct4TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1883 void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
1884 int adjusted_tx_height, void* src_buffer,
1885 int /*start_x*/, int /*start_y*/,
1886 void* /*dst_frame*/) {
1887 auto* src = static_cast<int32_t*>(src_buffer);
1888 const int tx_height = kTransformHeight[tx_size];
1889 const bool should_round = (tx_height == 8);
1890 const int row_shift = (tx_height == 16);
1891
1892 if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
1893 return;
1894 }
1895
1896 if (should_round) {
1897 ApplyRounding<4>(src, adjusted_tx_height);
1898 }
1899
1900 // Process 4 1d dct4 rows in parallel per iteration.
1901 int i = adjusted_tx_height;
1902 auto* data = src;
1903 do {
1904 Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
1905 row_shift);
1906 data += 16;
1907 i -= 4;
1908 } while (i != 0);
1909 }
1910
Dct4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)1911 void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
1912 int adjusted_tx_height, void* src_buffer,
1913 int start_x, int start_y, void* dst_frame) {
1914 auto* src = static_cast<int32_t*>(src_buffer);
1915 const int tx_width = kTransformWidth[tx_size];
1916
1917 if (kTransformFlipColumnsMask.Contains(tx_type)) {
1918 FlipColumns<4>(src, tx_width);
1919 }
1920
1921 if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
1922 // Process 4 1d dct4 columns in parallel per iteration.
1923 int i = tx_width;
1924 auto* data = src;
1925 do {
1926 Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
1927 /*row_shift=*/0);
1928 data += 4;
1929 i -= 4;
1930 } while (i != 0);
1931 }
1932
1933 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
1934 StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
1935 }
1936
Dct8TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1937 void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
1938 int adjusted_tx_height, void* src_buffer,
1939 int /*start_x*/, int /*start_y*/,
1940 void* /*dst_frame*/) {
1941 auto* src = static_cast<int32_t*>(src_buffer);
1942 const bool should_round = kShouldRound[tx_size];
1943 const uint8_t row_shift = kTransformRowShift[tx_size];
1944
1945 if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
1946 return;
1947 }
1948
1949 if (should_round) {
1950 ApplyRounding<8>(src, adjusted_tx_height);
1951 }
1952
1953 // Process 4 1d dct8 rows in parallel per iteration.
1954 int i = adjusted_tx_height;
1955 auto* data = src;
1956 do {
1957 Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
1958 row_shift);
1959 data += 32;
1960 i -= 4;
1961 } while (i != 0);
1962 }
1963
Dct8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)1964 void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
1965 int adjusted_tx_height, void* src_buffer,
1966 int start_x, int start_y, void* dst_frame) {
1967 auto* src = static_cast<int32_t*>(src_buffer);
1968 const int tx_width = kTransformWidth[tx_size];
1969
1970 if (kTransformFlipColumnsMask.Contains(tx_type)) {
1971 FlipColumns<8>(src, tx_width);
1972 }
1973
1974 if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
1975 // Process 4 1d dct8 columns in parallel per iteration.
1976 int i = tx_width;
1977 auto* data = src;
1978 do {
1979 Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
1980 /*row_shift=*/0);
1981 data += 4;
1982 i -= 4;
1983 } while (i != 0);
1984 }
1985 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
1986 StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
1987 }
1988
Dct16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)1989 void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
1990 TransformSize tx_size, int adjusted_tx_height,
1991 void* src_buffer, int /*start_x*/,
1992 int /*start_y*/, void* /*dst_frame*/) {
1993 auto* src = static_cast<int32_t*>(src_buffer);
1994 const bool should_round = kShouldRound[tx_size];
1995 const uint8_t row_shift = kTransformRowShift[tx_size];
1996
1997 if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
1998 return;
1999 }
2000
2001 if (should_round) {
2002 ApplyRounding<16>(src, adjusted_tx_height);
2003 }
2004
2005 assert(adjusted_tx_height % 4 == 0);
2006 int i = adjusted_tx_height;
2007 auto* data = src;
2008 do {
2009 // Process 4 1d dct16 rows in parallel per iteration.
2010 Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
2011 data += 64;
2012 i -= 4;
2013 } while (i != 0);
2014 }
2015
Dct16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2016 void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2017 int adjusted_tx_height, void* src_buffer,
2018 int start_x, int start_y, void* dst_frame) {
2019 auto* src = static_cast<int32_t*>(src_buffer);
2020 const int tx_width = kTransformWidth[tx_size];
2021
2022 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2023 FlipColumns<16>(src, tx_width);
2024 }
2025
2026 if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
2027 // Process 4 1d dct16 columns in parallel per iteration.
2028 int i = tx_width;
2029 auto* data = src;
2030 do {
2031 Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
2032 /*row_shift=*/0);
2033 data += 4;
2034 i -= 4;
2035 } while (i != 0);
2036 }
2037 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2038 StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
2039 }
2040
Dct32TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2041 void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
2042 TransformSize tx_size, int adjusted_tx_height,
2043 void* src_buffer, int /*start_x*/,
2044 int /*start_y*/, void* /*dst_frame*/) {
2045 auto* src = static_cast<int32_t*>(src_buffer);
2046 const bool should_round = kShouldRound[tx_size];
2047 const uint8_t row_shift = kTransformRowShift[tx_size];
2048
2049 if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
2050 return;
2051 }
2052
2053 if (should_round) {
2054 ApplyRounding<32>(src, adjusted_tx_height);
2055 }
2056
2057 assert(adjusted_tx_height % 4 == 0);
2058 int i = adjusted_tx_height;
2059 auto* data = src;
2060 do {
2061 // Process 4 1d dct32 rows in parallel per iteration.
2062 Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
2063 data += 128;
2064 i -= 4;
2065 } while (i != 0);
2066 }
2067
Dct32TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2068 void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2069 int adjusted_tx_height, void* src_buffer,
2070 int start_x, int start_y, void* dst_frame) {
2071 auto* src = static_cast<int32_t*>(src_buffer);
2072 const int tx_width = kTransformWidth[tx_size];
2073
2074 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2075 FlipColumns<32>(src, tx_width);
2076 }
2077
2078 if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
2079 // Process 4 1d dct32 columns in parallel per iteration.
2080 int i = tx_width;
2081 auto* data = src;
2082 do {
2083 Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2084 data += 4;
2085 i -= 4;
2086 } while (i != 0);
2087 }
2088 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2089 StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
2090 }
2091
Dct64TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2092 void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
2093 TransformSize tx_size, int adjusted_tx_height,
2094 void* src_buffer, int /*start_x*/,
2095 int /*start_y*/, void* /*dst_frame*/) {
2096 auto* src = static_cast<int32_t*>(src_buffer);
2097 const bool should_round = kShouldRound[tx_size];
2098 const uint8_t row_shift = kTransformRowShift[tx_size];
2099
2100 if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
2101 return;
2102 }
2103
2104 if (should_round) {
2105 ApplyRounding<64>(src, adjusted_tx_height);
2106 }
2107
2108 assert(adjusted_tx_height % 4 == 0);
2109 int i = adjusted_tx_height;
2110 auto* data = src;
2111 do {
2112 // Process 4 1d dct64 rows in parallel per iteration.
2113 Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
2114 data += 128 * 2;
2115 i -= 4;
2116 } while (i != 0);
2117 }
2118
Dct64TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2119 void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2120 int adjusted_tx_height, void* src_buffer,
2121 int start_x, int start_y, void* dst_frame) {
2122 auto* src = static_cast<int32_t*>(src_buffer);
2123 const int tx_width = kTransformWidth[tx_size];
2124
2125 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2126 FlipColumns<64>(src, tx_width);
2127 }
2128
2129 if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
2130 // Process 4 1d dct64 columns in parallel per iteration.
2131 int i = tx_width;
2132 auto* data = src;
2133 do {
2134 Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2135 data += 4;
2136 i -= 4;
2137 } while (i != 0);
2138 }
2139 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2140 StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
2141 }
2142
Adst4TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2143 void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
2144 TransformSize tx_size, int adjusted_tx_height,
2145 void* src_buffer, int /*start_x*/,
2146 int /*start_y*/, void* /*dst_frame*/) {
2147 auto* src = static_cast<int32_t*>(src_buffer);
2148 const int tx_height = kTransformHeight[tx_size];
2149 const int row_shift = static_cast<int>(tx_height == 16);
2150 const bool should_round = (tx_height == 8);
2151
2152 if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2153 return;
2154 }
2155
2156 if (should_round) {
2157 ApplyRounding<4>(src, adjusted_tx_height);
2158 }
2159
2160 // Process 4 1d adst4 rows in parallel per iteration.
2161 int i = adjusted_tx_height;
2162 auto* data = src;
2163 do {
2164 Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
2165 data += 16;
2166 i -= 4;
2167 } while (i != 0);
2168 }
2169
Adst4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2170 void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2171 int adjusted_tx_height, void* src_buffer,
2172 int start_x, int start_y, void* dst_frame) {
2173 auto* src = static_cast<int32_t*>(src_buffer);
2174 const int tx_width = kTransformWidth[tx_size];
2175
2176 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2177 FlipColumns<4>(src, tx_width);
2178 }
2179
2180 if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2181 // Process 4 1d adst4 columns in parallel per iteration.
2182 int i = tx_width;
2183 auto* data = src;
2184 do {
2185 Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
2186 data += 4;
2187 i -= 4;
2188 } while (i != 0);
2189 }
2190
2191 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2192 StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2193 tx_width, src, tx_type);
2194 }
2195
Adst8TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2196 void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
2197 TransformSize tx_size, int adjusted_tx_height,
2198 void* src_buffer, int /*start_x*/,
2199 int /*start_y*/, void* /*dst_frame*/) {
2200 auto* src = static_cast<int32_t*>(src_buffer);
2201 const bool should_round = kShouldRound[tx_size];
2202 const uint8_t row_shift = kTransformRowShift[tx_size];
2203
2204 if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2205 return;
2206 }
2207
2208 if (should_round) {
2209 ApplyRounding<8>(src, adjusted_tx_height);
2210 }
2211
2212 // Process 4 1d adst8 rows in parallel per iteration.
2213 assert(adjusted_tx_height % 4 == 0);
2214 int i = adjusted_tx_height;
2215 auto* data = src;
2216 do {
2217 Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
2218 /*transpose=*/true, row_shift);
2219 data += 32;
2220 i -= 4;
2221 } while (i != 0);
2222 }
2223
Adst8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2224 void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
2225 int adjusted_tx_height, void* src_buffer,
2226 int start_x, int start_y, void* dst_frame) {
2227 auto* src = static_cast<int32_t*>(src_buffer);
2228 const int tx_width = kTransformWidth[tx_size];
2229
2230 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2231 FlipColumns<8>(src, tx_width);
2232 }
2233
2234 if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2235 // Process 4 1d adst8 columns in parallel per iteration.
2236 int i = tx_width;
2237 auto* data = src;
2238 do {
2239 Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
2240 /*row_shift=*/0);
2241 data += 4;
2242 i -= 4;
2243 } while (i != 0);
2244 }
2245 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2246 StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2247 tx_width, src, tx_type);
2248 }
2249
Adst16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2250 void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
2251 TransformSize tx_size, int adjusted_tx_height,
2252 void* src_buffer, int /*start_x*/,
2253 int /*start_y*/, void* /*dst_frame*/) {
2254 auto* src = static_cast<int32_t*>(src_buffer);
2255 const bool should_round = kShouldRound[tx_size];
2256 const uint8_t row_shift = kTransformRowShift[tx_size];
2257
2258 if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2259 return;
2260 }
2261
2262 if (should_round) {
2263 ApplyRounding<16>(src, adjusted_tx_height);
2264 }
2265
2266 assert(adjusted_tx_height % 4 == 0);
2267 int i = adjusted_tx_height;
2268 do {
2269 // Process 4 1d adst16 rows in parallel per iteration.
2270 Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
2271 src += 64;
2272 i -= 4;
2273 } while (i != 0);
2274 }
2275
Adst16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2276 void Adst16TransformLoopColumn_NEON(TransformType tx_type,
2277 TransformSize tx_size,
2278 int adjusted_tx_height, void* src_buffer,
2279 int start_x, int start_y, void* dst_frame) {
2280 auto* src = static_cast<int32_t*>(src_buffer);
2281 const int tx_width = kTransformWidth[tx_size];
2282
2283 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2284 FlipColumns<16>(src, tx_width);
2285 }
2286
2287 if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
2288 int i = tx_width;
2289 auto* data = src;
2290 do {
2291 // Process 4 1d adst16 columns in parallel per iteration.
2292 Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
2293 /*row_shift=*/0);
2294 data += 4;
2295 i -= 4;
2296 } while (i != 0);
2297 }
2298 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2299 StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
2300 tx_width, src, tx_type);
2301 }
2302
Identity4TransformLoopRow_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2303 void Identity4TransformLoopRow_NEON(TransformType tx_type,
2304 TransformSize tx_size,
2305 int adjusted_tx_height, void* src_buffer,
2306 int /*start_x*/, int /*start_y*/,
2307 void* /*dst_frame*/) {
2308 // Special case: Process row calculations during column transform call.
2309 // Improves performance.
2310 if (tx_type == kTransformTypeIdentityIdentity &&
2311 tx_size == kTransformSize4x4) {
2312 return;
2313 }
2314
2315 auto* src = static_cast<int32_t*>(src_buffer);
2316 const int tx_height = kTransformHeight[tx_size];
2317 const bool should_round = (tx_height == 8);
2318
2319 if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
2320 return;
2321 }
2322
2323 if (should_round) {
2324 ApplyRounding<4>(src, adjusted_tx_height);
2325 }
2326
2327 const int shift = tx_height > 8 ? 1 : 0;
2328 int i = adjusted_tx_height;
2329 do {
2330 Identity4_NEON(src, /*step=*/4, shift);
2331 src += 16;
2332 i -= 4;
2333 } while (i != 0);
2334 }
2335
Identity4TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2336 void Identity4TransformLoopColumn_NEON(TransformType tx_type,
2337 TransformSize tx_size,
2338 int adjusted_tx_height, void* src_buffer,
2339 int start_x, int start_y,
2340 void* dst_frame) {
2341 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2342 auto* src = static_cast<int32_t*>(src_buffer);
2343 const int tx_width = kTransformWidth[tx_size];
2344
2345 // Special case: Process row calculations during column transform call.
2346 if (tx_type == kTransformTypeIdentityIdentity &&
2347 (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
2348 Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
2349 adjusted_tx_height, src);
2350 return;
2351 }
2352
2353 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2354 FlipColumns<4>(src, tx_width);
2355 }
2356
2357 IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
2358 adjusted_tx_height, src);
2359 }
2360
// Row pass of the 8-point identity transform for 10bpp. Dispatches to one of
// three code paths based on the transform height.
void Identity8TransformLoopRow_NEON(TransformType tx_type,
                                    TransformSize tx_size,
                                    int adjusted_tx_height, void* src_buffer,
                                    int /*start_x*/, int /*start_y*/,
                                    void* /*dst_frame*/) {
  // Special case: Process row calculations during column transform call.
  // Improves performance.
  if (tx_type == kTransformTypeIdentityIdentity &&
      tx_size == kTransformSize8x4) {
    return;
  }

  auto* src = static_cast<int32_t*>(src_buffer);
  const int tx_height = kTransformHeight[tx_size];
  const bool should_round = kShouldRound[tx_size];
  const uint8_t row_shift = kTransformRowShift[tx_size];

  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
    return;
  }
  if (should_round) {
    ApplyRounding<8>(src, adjusted_tx_height);
  }

  // When combining the identity8 multiplier with the row shift, the
  // calculations for tx_height == 8 and tx_height == 16 can be simplified
  // from ((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed 16
  // bit value.
  if ((tx_height & 0x18) != 0) {  // True when tx_height is 8 or 16.
    for (int i = 0; i < tx_height; ++i) {
      const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
      const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
      // Saturate to int16 and widen back to int32 (clamp only, no scaling).
      vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
      vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
    }
    return;
  }
  if (tx_height == 32) {
    // Process 4 rows of 8 coefficients per iteration.
    int i = adjusted_tx_height;
    do {
      Identity8Row32_NEON(src, /*step=*/8);
      src += 32;
      i -= 4;
    } while (i != 0);
    return;
  }

  // Remaining case: tx_height == 4.
  assert(tx_size == kTransformSize8x4);
  int i = adjusted_tx_height;
  do {
    Identity8Row4_NEON(src, /*step=*/8);
    src += 32;
    i -= 4;
  } while (i != 0);
}
2416
Identity8TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2417 void Identity8TransformLoopColumn_NEON(TransformType tx_type,
2418 TransformSize tx_size,
2419 int adjusted_tx_height, void* src_buffer,
2420 int start_x, int start_y,
2421 void* dst_frame) {
2422 auto* src = static_cast<int32_t*>(src_buffer);
2423 const int tx_width = kTransformWidth[tx_size];
2424
2425 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2426 FlipColumns<8>(src, tx_width);
2427 }
2428 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2429 IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
2430 adjusted_tx_height, src);
2431 }
2432
Identity16TransformLoopRow_NEON(TransformType,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int,int,void *)2433 void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
2434 TransformSize tx_size,
2435 int adjusted_tx_height, void* src_buffer,
2436 int /*start_x*/, int /*start_y*/,
2437 void* /*dst_frame*/) {
2438 auto* src = static_cast<int32_t*>(src_buffer);
2439 const bool should_round = kShouldRound[tx_size];
2440 const uint8_t row_shift = kTransformRowShift[tx_size];
2441
2442 if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
2443 return;
2444 }
2445
2446 if (should_round) {
2447 ApplyRounding<16>(src, adjusted_tx_height);
2448 }
2449 int i = adjusted_tx_height;
2450 do {
2451 Identity16Row_NEON(src, /*step=*/16, row_shift);
2452 src += 64;
2453 i -= 4;
2454 } while (i != 0);
2455 }
2456
Identity16TransformLoopColumn_NEON(TransformType tx_type,TransformSize tx_size,int adjusted_tx_height,void * src_buffer,int start_x,int start_y,void * dst_frame)2457 void Identity16TransformLoopColumn_NEON(TransformType tx_type,
2458 TransformSize tx_size,
2459 int adjusted_tx_height,
2460 void* src_buffer, int start_x,
2461 int start_y, void* dst_frame) {
2462 auto* src = static_cast<int32_t*>(src_buffer);
2463 const int tx_width = kTransformWidth[tx_size];
2464
2465 if (kTransformFlipColumnsMask.Contains(tx_type)) {
2466 FlipColumns<16>(src, tx_width);
2467 }
2468 auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
2469 IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
2470 adjusted_tx_height, src);
2471 }
2472
2473 //------------------------------------------------------------------------------
2474
// Registers the NEON implementations of all 10bpp inverse transforms in the
// writable dsp function table.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  // Maximum transform size for Dct is 64.
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
      Dct4TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
      Dct4TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
      Dct8TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
      Dct8TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
      Dct16TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
      Dct16TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
      Dct32TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
      Dct32TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
      Dct64TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
      Dct64TransformLoopColumn_NEON;

  // Maximum transform size for Adst is 16.
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
      Adst4TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
      Adst4TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
      Adst8TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
      Adst8TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
      Adst16TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
      Adst16TransformLoopColumn_NEON;

  // Maximum transform size for Identity transform is 32.
  // NOTE(review): only sizes 4-16 are registered here; the size-32 identity
  // entries are not populated in this file — confirm whether the C fallback
  // covers k1DTransformSize32 or NEON versions exist elsewhere.
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
      Identity4TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
      Identity4TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
      Identity8TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
      Identity8TransformLoopColumn_NEON;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
      Identity16TransformLoopRow_NEON;
  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
      Identity16TransformLoopColumn_NEON;
}
2528
2529 } // namespace
2530
// Public entry point: installs the 10bpp NEON inverse transform
// implementations into the dsp table.
void InverseTransformInit10bpp_NEON() { Init10bpp(); }
2532
2533 } // namespace dsp
2534 } // namespace libgav1
2535 #else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
2536 namespace libgav1 {
2537 namespace dsp {
2538
// NEON or 10bpp support is disabled at build time: nothing to initialize.
void InverseTransformInit10bpp_NEON() {}
2540
2541 } // namespace dsp
2542 } // namespace libgav1
2543 #endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
2544