• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <assert.h>
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"
14 
copy_128(const uint8_t * src,uint8_t * dst)15 static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
16   __m128i s[8];
17   s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
18   s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
19   s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
20   s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
21   s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
22   s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
23   s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
24   s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
25   _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
26   _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
27   _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
28   _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
29   _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
30   _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
31   _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
32   _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
33 }
34 
aom_convolve_copy_sse2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,int w,int h)35 void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
36                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
37   if (w >= 16) {
38     assert(!((intptr_t)dst % 16));
39     assert(!(dst_stride % 16));
40   }
41 
42   if (w == 2) {
43     do {
44       memmove(dst, src, 2 * sizeof(*src));
45       src += src_stride;
46       dst += dst_stride;
47       memmove(dst, src, 2 * sizeof(*src));
48       src += src_stride;
49       dst += dst_stride;
50       h -= 2;
51     } while (h);
52   } else if (w == 4) {
53     do {
54       memmove(dst, src, 4 * sizeof(*src));
55       src += src_stride;
56       dst += dst_stride;
57       memmove(dst, src, 4 * sizeof(*src));
58       src += src_stride;
59       dst += dst_stride;
60       h -= 2;
61     } while (h);
62   } else if (w == 8) {
63     do {
64       __m128i s[2];
65       s[0] = _mm_loadl_epi64((__m128i *)src);
66       src += src_stride;
67       s[1] = _mm_loadl_epi64((__m128i *)src);
68       src += src_stride;
69       _mm_storel_epi64((__m128i *)dst, s[0]);
70       dst += dst_stride;
71       _mm_storel_epi64((__m128i *)dst, s[1]);
72       dst += dst_stride;
73       h -= 2;
74     } while (h);
75   } else if (w == 16) {
76     do {
77       __m128i s[2];
78       s[0] = _mm_loadu_si128((__m128i *)src);
79       src += src_stride;
80       s[1] = _mm_loadu_si128((__m128i *)src);
81       src += src_stride;
82       _mm_store_si128((__m128i *)dst, s[0]);
83       dst += dst_stride;
84       _mm_store_si128((__m128i *)dst, s[1]);
85       dst += dst_stride;
86       h -= 2;
87     } while (h);
88   } else if (w == 32) {
89     do {
90       __m128i s[4];
91       s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
92       s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
93       src += src_stride;
94       s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
95       s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
96       src += src_stride;
97       _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
98       _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
99       dst += dst_stride;
100       _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
101       _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
102       dst += dst_stride;
103       h -= 2;
104     } while (h);
105   } else if (w == 64) {
106     do {
107       __m128i s[8];
108       s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
109       s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
110       s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
111       s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
112       src += src_stride;
113       s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
114       s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
115       s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
116       s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
117       src += src_stride;
118       _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
119       _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
120       _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
121       _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
122       dst += dst_stride;
123       _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
124       _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
125       _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
126       _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
127       dst += dst_stride;
128       h -= 2;
129     } while (h);
130   } else {
131     do {
132       copy_128(src, dst);
133       src += src_stride;
134       dst += dst_stride;
135       copy_128(src, dst);
136       src += src_stride;
137       dst += dst_stride;
138       h -= 2;
139     } while (h);
140   }
141 }
142 
highbd_copy_64(const uint16_t * src,uint16_t * dst)143 static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
144   __m128i s[8];
145   s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
146   s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
147   s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
148   s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
149   s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
150   s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
151   s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
152   s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
153   _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
154   _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
155   _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
156   _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
157   _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
158   _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
159   _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
160   _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
161 }
162 
highbd_copy_128(const uint16_t * src,uint16_t * dst)163 static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
164   __m128i s[16];
165   s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
166   s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
167   s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
168   s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
169   s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
170   s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
171   s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
172   s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
173   s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
174   s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
175   s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
176   s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
177   s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
178   s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
179   s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
180   s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
181   _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
182   _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
183   _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
184   _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
185   _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
186   _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
187   _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
188   _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
189   _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
190   _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
191   _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
192   _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
193   _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
194   _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
195   _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
196   _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
197 }
198 
aom_highbd_convolve_copy_sse2(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,int w,int h)199 void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
200                                    uint16_t *dst, ptrdiff_t dst_stride, int w,
201                                    int h) {
202   if (w >= 16) {
203     assert(!((intptr_t)dst % 16));
204     assert(!(dst_stride % 16));
205   }
206 
207   if (w == 2) {
208     do {
209       __m128i s = _mm_loadl_epi64((__m128i *)src);
210       *(int *)dst = _mm_cvtsi128_si32(s);
211       src += src_stride;
212       dst += dst_stride;
213       s = _mm_loadl_epi64((__m128i *)src);
214       *(int *)dst = _mm_cvtsi128_si32(s);
215       src += src_stride;
216       dst += dst_stride;
217       h -= 2;
218     } while (h);
219   } else if (w == 4) {
220     do {
221       __m128i s[2];
222       s[0] = _mm_loadl_epi64((__m128i *)src);
223       src += src_stride;
224       s[1] = _mm_loadl_epi64((__m128i *)src);
225       src += src_stride;
226       _mm_storel_epi64((__m128i *)dst, s[0]);
227       dst += dst_stride;
228       _mm_storel_epi64((__m128i *)dst, s[1]);
229       dst += dst_stride;
230       h -= 2;
231     } while (h);
232   } else if (w == 8) {
233     do {
234       __m128i s[2];
235       s[0] = _mm_loadu_si128((__m128i *)src);
236       src += src_stride;
237       s[1] = _mm_loadu_si128((__m128i *)src);
238       src += src_stride;
239       _mm_store_si128((__m128i *)dst, s[0]);
240       dst += dst_stride;
241       _mm_store_si128((__m128i *)dst, s[1]);
242       dst += dst_stride;
243       h -= 2;
244     } while (h);
245   } else if (w == 16) {
246     do {
247       __m128i s[4];
248       s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
249       s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
250       src += src_stride;
251       s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
252       s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
253       src += src_stride;
254       _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
255       _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
256       dst += dst_stride;
257       _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
258       _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
259       dst += dst_stride;
260       h -= 2;
261     } while (h);
262   } else if (w == 32) {
263     do {
264       __m128i s[8];
265       s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
266       s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
267       s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
268       s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
269       src += src_stride;
270       s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
271       s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
272       s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
273       s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
274       src += src_stride;
275       _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
276       _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
277       _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
278       _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
279       dst += dst_stride;
280       _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
281       _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
282       _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
283       _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
284       dst += dst_stride;
285       h -= 2;
286     } while (h);
287   } else if (w == 64) {
288     do {
289       highbd_copy_64(src, dst);
290       src += src_stride;
291       dst += dst_stride;
292       highbd_copy_64(src, dst);
293       src += src_stride;
294       dst += dst_stride;
295       h -= 2;
296     } while (h);
297   } else {
298     do {
299       highbd_copy_128(src, dst);
300       src += src_stride;
301       dst += dst_stride;
302       highbd_copy_128(src, dst);
303       src += src_stride;
304       dst += dst_stride;
305       h -= 2;
306     } while (h);
307   }
308 }
309