/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
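
/*
 * Averaging "convolve" copy: each output byte becomes the rounded average
 * of the source and the existing destination, dst = (src + dst + 1) >> 1,
 * computed with the NEON rounding halving add (vrhadd). The filter
 * arguments belong to the shared convolve prototype and are ignored here.
 */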
void vpx_convolve_avg_neon(
    const uint8_t *src,    // r0
    ptrdiff_t src_stride,  // r1
    uint8_t *dst,          // r2
    ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  /* Zero-initialize: the w == 4 path fills these one lane at a time, and
   * the first vld1_lane would otherwise read an uninitialized vector. */
  uint32x2_t d0u32 = vdup_n_u32(0), d2u32 = vdup_n_u32(0);
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x; (void)filter_x_stride;
  (void)filter_y; (void)filter_y_stride;

  d = dst;
  if (w > 32) { // avg64
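    // 64-wide: one row per iteration, four 16-byte vectors per row.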
    for (; h > 0; h -= 1) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) { // avg32
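    // 32-wide: two rows per iteration, so four q registers stay busy.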
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) { // avg16
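    // 16-wide: two rows per iteration, one 16-byte vector each.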
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) { // avg8
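    // 8-wide: combine two 8-byte rows into one q register so a single
    // vrhaddq_u8 averages both rows at once.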
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else { // avg4
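    // 4-wide: gather two 4-byte rows into the two lanes of a 32x2 vector,
    // average them as bytes, then scatter the lanes back out.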
    for (; h > 0; h -= 2) {
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                       vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
}
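
/*
 * Example (hypothetical buffers and sizes): average a predicted 16x16
 * block into a reconstruction buffer. The filter pointers and strides are
 * unused by this function, so placeholder values are acceptable:
 *
 *   vpx_convolve_avg_neon(pred, 16, recon, recon_stride,
 *                         NULL, 0, NULL, 0, 16, 16);
 */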