/*
 * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

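// copy_128: copy one 128-byte (w == 128) row using four unaligned 32-byte
// AVX2 loads followed by four unaligned 32-byte stores.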
static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
  __m256i s[4];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
}

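// Width-specialized block copy for 8-bit pixels. Each branch copies two rows
// per iteration, so h must be even. Widths 2 and 4 use scalar memmove;
// widths 8 and 16 use SSE2 loads/stores; widths 32, 64 and 128 use 32-byte
// AVX2 loads/stores.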
void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m256i s[2];
      s[0] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      s[1] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, s[0]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      __m256i s[4];
      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
      src += src_stride;
      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH

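// highbd_copy_64: copy one 64-pixel (128-byte) high-bit-depth row using four
// unaligned 32-byte AVX2 loads followed by four unaligned 32-byte stores.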
static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  __m256i s[4];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
}

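// highbd_copy_128: copy one 128-pixel (256-byte) high-bit-depth row using
// eight unaligned 32-byte AVX2 loads followed by eight unaligned 32-byte
// stores.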
static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  __m256i s[8];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));

  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
}

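// Width-specialized block copy for high-bit-depth (uint16_t) pixels. Each
// branch copies two rows per iteration, so h must be even. Width 2 uses
// scalar memmove; widths 4 and 8 use SSE2 loads/stores; widths 16 and wider
// use 32-byte AVX2 loads/stores.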
void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m256i s[2];
      s[0] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      s[1] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, s[0]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m256i s[4];
      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    assert(w == 128);
    do {
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}

#endif  // CONFIG_AV1_HIGHBITDEPTH