/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

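// Averages a block of src into dst: dst = (src + dst + 1) >> 1 for each
// pixel. The filter and bit-depth arguments are part of the common convolve
// prototype but are unused here because no filtering is applied. All branches
// except the 64-wide one process two rows per iteration, so h is assumed to
// be even.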
void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      // Combine two 4-pixel rows into a single 8-lane vector so that one
      // vrhaddq_u16 computes the rounded average of both rows.
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);

      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;

      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);

      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;

      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    // The 64-wide case processes one full row per iteration, split into two
    // 32-pixel halves, so h is decremented by one per pass.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);

      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}