/*
 * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
19
// Returns the block error: the sum of squared differences between the source
// coefficients (coeff) and the dequantized coefficients (dqcoeff). Also
// stores the sum of squared source coefficients through *ssz.
// block_size must be a positive multiple of 16 (asserted below).
int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                            intptr_t block_size, int64_t *ssz) {
  // Two independent 64-bit accumulator pairs to expose instruction-level
  // parallelism across loop iterations.
  int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
  int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };

  assert(block_size >= 16);
  assert((block_size % 16) == 0);

  // Process 16 coefficients per iteration.
  do {
    // NOTE(review): load_tran_low_to_s16q narrows tran_low_t lanes to int16;
    // this assumes coefficient magnitudes fit in 16 bits — confirm against
    // the quantizer's value range.
    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);

    const int16x8_t diff0 = vsubq_s16(c0, d0);
    const int16x8_t diff1 = vsubq_s16(c1, d1);

    // SVE SDOT via the bridge helper accumulates int16 dot products directly
    // into 64-bit lanes, so no intermediate 32-bit widening step is needed.
    error[0] = aom_sdotq_s16(error[0], diff0, diff0);
    error[1] = aom_sdotq_s16(error[1], diff1, diff1);
    sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0);
    sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1);

    coeff += 16;
    dqcoeff += 16;
    block_size -= 16;
  } while (block_size != 0);

  // Horizontal reduction of the vector accumulators to scalar results.
  *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1]));
  return vaddvq_s64(vaddq_s64(error[0], error[1]));
}
50
// Low-precision (int16 coefficient) variant of av1_block_error: returns the
// sum of squared differences between coeff and dqcoeff. block_size must be
// either 16 or a multiple of 32 (asserted below for the non-32 path).
int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
                               int block_size) {
  if (block_size % 32 == 0) {
    // Fast path: process 32 coefficients per iteration with four independent
    // 64-bit accumulators to expose instruction-level parallelism.
    int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
                           vdupq_n_s64(0) };

    do {
      const int16x8_t c0 = vld1q_s16(coeff);
      const int16x8_t c1 = vld1q_s16(coeff + 8);
      const int16x8_t c2 = vld1q_s16(coeff + 16);
      const int16x8_t c3 = vld1q_s16(coeff + 24);
      const int16x8_t d0 = vld1q_s16(dqcoeff);
      const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
      const int16x8_t d2 = vld1q_s16(dqcoeff + 16);
      const int16x8_t d3 = vld1q_s16(dqcoeff + 24);

      const int16x8_t diff0 = vsubq_s16(c0, d0);
      const int16x8_t diff1 = vsubq_s16(c1, d1);
      const int16x8_t diff2 = vsubq_s16(c2, d2);
      const int16x8_t diff3 = vsubq_s16(c3, d3);

      // SVE SDOT accumulates int16 squared differences straight into 64-bit
      // lanes — no intermediate 32-bit widening required.
      error[0] = aom_sdotq_s16(error[0], diff0, diff0);
      error[1] = aom_sdotq_s16(error[1], diff1, diff1);
      error[2] = aom_sdotq_s16(error[2], diff2, diff2);
      error[3] = aom_sdotq_s16(error[3], diff3, diff3);

      coeff += 32;
      dqcoeff += 32;
      block_size -= 32;
    } while (block_size != 0);

    // Pairwise tree reduction of the four accumulators, then horizontal add.
    error[0] = vaddq_s64(error[0], error[1]);
    error[2] = vaddq_s64(error[2], error[3]);
    error[0] = vaddq_s64(error[0], error[2]);
    return vaddvq_s64(error[0]);
  }
  // Remaining supported size: a single 16-coefficient block.
  assert(block_size == 16);

  int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };

  do {
    const int16x8_t c0 = vld1q_s16(coeff);
    const int16x8_t c1 = vld1q_s16(coeff + 8);
    const int16x8_t d0 = vld1q_s16(dqcoeff);
    const int16x8_t d1 = vld1q_s16(dqcoeff + 8);

    const int16x8_t diff0 = vsubq_s16(c0, d0);
    const int16x8_t diff1 = vsubq_s16(c1, d1);

    error[0] = aom_sdotq_s16(error[0], diff0, diff0);
    error[1] = aom_sdotq_s16(error[1], diff1, diff1);

    coeff += 16;
    dqcoeff += 16;
    block_size -= 16;
  } while (block_size != 0);

  return vaddvq_s64(vaddq_s64(error[0], error[1]));
}
110