1 /*
2 * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"
14
// Copy one 128-pixel (128-byte) row from src to dst with eight 16-byte
// vectors. src may be unaligned (_mm_loadu_si128); dst must be 16-byte
// aligned (_mm_store_si128).
static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
  __m128i row[8];
  // Load the entire row before storing any of it, preserving the original
  // load-all-then-store-all memory-operation order.
  for (int i = 0; i < 8; ++i) {
    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
  }
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 16), row[i]);
  }
}
34
aom_convolve_copy_sse2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,int w,int h)35 void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
36 uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
37 if (w >= 16) {
38 assert(!((intptr_t)dst % 16));
39 assert(!(dst_stride % 16));
40 }
41
42 if (w == 2) {
43 do {
44 memmove(dst, src, 2 * sizeof(*src));
45 src += src_stride;
46 dst += dst_stride;
47 memmove(dst, src, 2 * sizeof(*src));
48 src += src_stride;
49 dst += dst_stride;
50 h -= 2;
51 } while (h);
52 } else if (w == 4) {
53 do {
54 memmove(dst, src, 4 * sizeof(*src));
55 src += src_stride;
56 dst += dst_stride;
57 memmove(dst, src, 4 * sizeof(*src));
58 src += src_stride;
59 dst += dst_stride;
60 h -= 2;
61 } while (h);
62 } else if (w == 8) {
63 do {
64 __m128i s[2];
65 s[0] = _mm_loadl_epi64((__m128i *)src);
66 src += src_stride;
67 s[1] = _mm_loadl_epi64((__m128i *)src);
68 src += src_stride;
69 _mm_storel_epi64((__m128i *)dst, s[0]);
70 dst += dst_stride;
71 _mm_storel_epi64((__m128i *)dst, s[1]);
72 dst += dst_stride;
73 h -= 2;
74 } while (h);
75 } else if (w == 16) {
76 do {
77 __m128i s[2];
78 s[0] = _mm_loadu_si128((__m128i *)src);
79 src += src_stride;
80 s[1] = _mm_loadu_si128((__m128i *)src);
81 src += src_stride;
82 _mm_store_si128((__m128i *)dst, s[0]);
83 dst += dst_stride;
84 _mm_store_si128((__m128i *)dst, s[1]);
85 dst += dst_stride;
86 h -= 2;
87 } while (h);
88 } else if (w == 32) {
89 do {
90 __m128i s[4];
91 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
92 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
93 src += src_stride;
94 s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
95 s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
96 src += src_stride;
97 _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
98 _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
99 dst += dst_stride;
100 _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
101 _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
102 dst += dst_stride;
103 h -= 2;
104 } while (h);
105 } else if (w == 64) {
106 do {
107 __m128i s[8];
108 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
109 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
110 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
111 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
112 src += src_stride;
113 s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
114 s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
115 s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
116 s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
117 src += src_stride;
118 _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
119 _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
120 _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
121 _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
122 dst += dst_stride;
123 _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
124 _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
125 _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
126 _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
127 dst += dst_stride;
128 h -= 2;
129 } while (h);
130 } else {
131 do {
132 copy_128(src, dst);
133 src += src_stride;
134 dst += dst_stride;
135 copy_128(src, dst);
136 src += src_stride;
137 dst += dst_stride;
138 h -= 2;
139 } while (h);
140 }
141 }
142
// Copy one 64-pixel (128-byte) row of uint16_t pixels from src to dst.
// src may be unaligned; dst must be 16-byte aligned.
static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  __m128i row[8];
  // All loads precede all stores, matching the original ordering.
  for (int i = 0; i < 8; ++i) {
    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
  }
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 8), row[i]);
  }
}
162
// Copy one 128-pixel (256-byte) row of uint16_t pixels from src to dst.
// src may be unaligned; dst must be 16-byte aligned.
static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  __m128i row[16];
  // All loads precede all stores, matching the original ordering.
  for (int i = 0; i < 16; ++i) {
    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
  }
  for (int i = 0; i < 16; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 8), row[i]);
  }
}
198
aom_highbd_convolve_copy_sse2(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,int w,int h)199 void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
200 uint16_t *dst, ptrdiff_t dst_stride, int w,
201 int h) {
202 if (w >= 16) {
203 assert(!((intptr_t)dst % 16));
204 assert(!(dst_stride % 16));
205 }
206
207 if (w == 2) {
208 do {
209 __m128i s = _mm_loadl_epi64((__m128i *)src);
210 *(int *)dst = _mm_cvtsi128_si32(s);
211 src += src_stride;
212 dst += dst_stride;
213 s = _mm_loadl_epi64((__m128i *)src);
214 *(int *)dst = _mm_cvtsi128_si32(s);
215 src += src_stride;
216 dst += dst_stride;
217 h -= 2;
218 } while (h);
219 } else if (w == 4) {
220 do {
221 __m128i s[2];
222 s[0] = _mm_loadl_epi64((__m128i *)src);
223 src += src_stride;
224 s[1] = _mm_loadl_epi64((__m128i *)src);
225 src += src_stride;
226 _mm_storel_epi64((__m128i *)dst, s[0]);
227 dst += dst_stride;
228 _mm_storel_epi64((__m128i *)dst, s[1]);
229 dst += dst_stride;
230 h -= 2;
231 } while (h);
232 } else if (w == 8) {
233 do {
234 __m128i s[2];
235 s[0] = _mm_loadu_si128((__m128i *)src);
236 src += src_stride;
237 s[1] = _mm_loadu_si128((__m128i *)src);
238 src += src_stride;
239 _mm_store_si128((__m128i *)dst, s[0]);
240 dst += dst_stride;
241 _mm_store_si128((__m128i *)dst, s[1]);
242 dst += dst_stride;
243 h -= 2;
244 } while (h);
245 } else if (w == 16) {
246 do {
247 __m128i s[4];
248 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
249 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
250 src += src_stride;
251 s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
252 s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
253 src += src_stride;
254 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
255 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
256 dst += dst_stride;
257 _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
258 _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
259 dst += dst_stride;
260 h -= 2;
261 } while (h);
262 } else if (w == 32) {
263 do {
264 __m128i s[8];
265 s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
266 s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
267 s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
268 s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
269 src += src_stride;
270 s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
271 s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
272 s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
273 s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
274 src += src_stride;
275 _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
276 _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
277 _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
278 _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
279 dst += dst_stride;
280 _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
281 _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
282 _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
283 _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
284 dst += dst_stride;
285 h -= 2;
286 } while (h);
287 } else if (w == 64) {
288 do {
289 highbd_copy_64(src, dst);
290 src += src_stride;
291 dst += dst_stride;
292 highbd_copy_64(src, dst);
293 src += src_stride;
294 dst += dst_stride;
295 h -= 2;
296 } while (h);
297 } else {
298 do {
299 highbd_copy_128(src, dst);
300 src += src_stride;
301 dst += dst_stride;
302 highbd_copy_128(src, dst);
303 src += src_stride;
304 dst += dst_stride;
305 h -= 2;
306 } while (h);
307 }
308 }
309