/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

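/* 8x16 SAD: the first row seeds a widening absolute-difference accumulator
 * (vabdl_u8), the remaining 15 rows accumulate with vabal_u8, and pairwise
 * long adds reduce the eight 16-bit lanes to a single 32-bit sum. */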
unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 15; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

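/* 4x4 SAD. Note that vld1_u8 reads 8 bytes per row; only the low four
 * 16-bit accumulator lanes are summed at the end, so the extra columns do
 * not affect the result. */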
unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x2_t d1;
  uint64x1_t d3;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 3; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  d1 = vpaddl_u16(vget_low_u16(q12));
  d3 = vpaddl_u32(d1);

  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

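/* 16x8 SAD: the low and high halves of each 16-byte row are accumulated in
 * separate 16-bit accumulators, which are added together before the final
 * reduction. */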
unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

  for (i = 0; i < 7; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }

  q12 = vaddq_u16(q12, q13);
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

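/* Sum all lanes of two 16-bit accumulators into one 32-bit total, widening
 * to 32 bits before combining so that the large per-lane sums produced by
 * 64x64 blocks cannot overflow 16 bits. */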
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
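/* Sum all lanes of a single 16-bit accumulator into one 32-bit total via
 * successive pairwise widening adds. */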
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

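/* 64x64 SAD: each row is processed as four 16-byte loads, with the low and
 * high halves of each load split across two 16-bit accumulators that are
 * only combined in the widening reduction above. */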
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

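/* 32x32 SAD: two 16-byte loads per row. The combined 16-bit accumulator
 * cannot overflow for this block size, so the plain 16-bit reduction is
 * sufficient. */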
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

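/* 16x16 SAD: one 16-byte load per row, low/high halves accumulated
 * separately and combined before the 16-bit reduction. */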
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo =
        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
    vec_accum_hi =
        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

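/* 8x8 SAD: one 8-byte load per row into a single 16-bit accumulator. */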
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}
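
/* Usage sketch (illustrative only; in libvpx these kernels are normally
 * reached through the vpx_dsp run-time CPU dispatch rather than called
 * directly):
 *
 *   uint8_t src[16 * 16], ref[16 * 16];
 *   ...fill src and ref with pixel data...
 *   unsigned int sad = vpx_sad16x16_neon(src, 16, ref, 16);
 */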