/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"

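/* Shuffle index tables for __lsx_vshuf_b: each mask gathers the adjacent
 * source-byte pairs that feed one filter tap. Indices 16 and above select
 * bytes from the other source vector of the shuffle. */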
static const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

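/* 8-tap horizontal filtering of a 4x4 block. Input pixels are biased into
 * the signed domain with xor 128 and filtered there; __lsx_vssrarni_b_h
 * then applies the rounding shift by FILTER_BITS (7) with saturation, and
 * the final xor 128 undoes the bias. */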
static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter) {
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out, out0, out1;

  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
  src -= 3;
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);

  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filter0, filter1, filter2, filter3, out0, out1);
  out = __lsx_vssrarni_b_h(out1, out0, 7);
  out = __lsx_vxori_b(out, 128);
  __lsx_vstelm_w(out, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(out, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(out, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(out, dst, 0, 3);
}

static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter) {
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride + src_stride2;
  int32_t src_stride4 = src_stride2 << 1;
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;
  uint8_t *_src = (uint8_t *)src - 3;

  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  src0 = __lsx_vld(_src, 0);
  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
  src3 = __lsx_vldx(_src, src_stride3);
  _src += src_stride4;
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filter0, filter1, filter2, filter3, out0, out1);
  src0 = __lsx_vld(_src, 0);
  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
  src3 = __lsx_vldx(_src, src_stride3);
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filter0, filter1, filter2, filter3, out2, out3);
  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

  __lsx_vstelm_w(out0, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(out0, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(out0, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(out0, dst, 0, 3);
  dst += dst_stride;
  __lsx_vstelm_w(out1, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(out1, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(out1, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(out1, dst, 0, 3);
}

static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (height == 4) {
    common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
  } else if (height == 8) {
    common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
  }
}

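/* 8-tap filtering of an 8-wide column, 4 rows at a time; uses the 8-width
 * mask set at offset 0 of mc_filt_mask_arr. */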
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter) {
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  src -= 3;
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filter0, filter1, filter2, filter3, out0, out1,
                             out2, out3);
  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
  __lsx_vstelm_d(out0, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out0, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 1);
}

static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  uint32_t loop_cnt = height >> 2;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride + src_stride2;
  int32_t src_stride4 = src_stride2 << 1;
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;
  uint8_t *_src = (uint8_t *)src - 3;

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  for (; loop_cnt--;) {
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
    dst += dst_stride;
  }
}

static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (height == 4) {
    common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
  }
}

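/* 16-wide 8-tap: each row is filtered as two 8-wide halves loaded at byte
 * offsets 0 and 8; two rows are processed per loop iteration. */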
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt = height >> 1;
  int32_t stride = src_stride << 1;
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  src -= 3;
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  for (; loop_cnt--;) {
    const uint8_t *_src = src + src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
    DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vst(out0, dst, 0);
    dst += dst_stride;
    __lsx_vst(out1, dst, 0);
    dst += dst_stride;
    src += stride;
  }
}

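/* 32-wide 8-tap: each row is loaded as two 16-byte vectors plus a third
 * load at offset 24; shuff splices row bytes 8..23 into src1 so the 8-tap
 * window can cross the 16-byte boundary. */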
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt = height >> 1;
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  src -= 3;
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
    src3 = __lsx_vld(src, 24);
    src1 = __lsx_vshuf_b(src2, src0, shuff);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vst(out0, dst, 0);
    __lsx_vst(out1, dst, 16);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
    src3 = __lsx_vld(src, 24);
    src1 = __lsx_vshuf_b(src2, src0, shuff);
    src += src_stride;

    dst += dst_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vst(out0, dst, 0);
    __lsx_vst(out1, dst, 16);
    dst += dst_stride;
  }
}

static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  int32_t loop_cnt = height;
  __m128i src0, src1, src2, src3;
  __m128i filter0, filter1, filter2, filter3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i out0, out1, out2, out3;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  src -= 3;
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);
  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
            filter0, filter1, filter2, filter3);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
    src3 = __lsx_vld(src, 24);
    src1 = __lsx_vshuf_b(src2, src0, shuff);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vst(out0, dst, 0);
    __lsx_vst(out1, dst, 16);

    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
    src3 = __lsx_vld(src, 56);
    src1 = __lsx_vshuf_b(src2, src0, shuff);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filter0, filter1, filter2, filter3, out0,
                               out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vst(out0, dst, 32);
    __lsx_vst(out1, dst, 48);
    src += src_stride;
    dst += dst_stride;
  }
}

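/* 2-tap (bilinear) horizontal path: the repeated tap pair loaded with
 * __lsx_vldrepl_h is applied through the unsigned dot product
 * __lsx_vdp2_h_bu, and __lsx_vssrarni_bu_h rounds the 16-bit sums back to
 * 8 bits by FILTER_BITS. */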
static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  __m128i src0, src1, src2, src3, mask;
  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride + src_stride2;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride + dst_stride2;

  mask = __lsx_vld(mc_filt_mask_arr, 16);
  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
  src3 = __lsx_vldx(src, src_stride3);
  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
  DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
            FILTER_BITS, res0, res1);

  __lsx_vstelm_w(res0, dst, 0, 0);
  __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
  __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
  __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
}

static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
  __m128i res0, res1, res2, res3, filt0;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride + src_stride2;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride + dst_stride2;
  uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;

  mask = __lsx_vld(mc_filt_mask_arr, 16);

  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
            src, src_stride4, src1, src2, src3, src4);
  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
            src6);
  src7 = __lsx_vldx(src_tmp1, src_stride3);

  DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
            src7, src6, mask, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
            vec4, vec5, vec6, vec7);
  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
            res1, res2, res3);

  __lsx_vstelm_w(res0, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(res0, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(res1, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(res1, dst, 0, 1);
  dst += dst_stride;

  __lsx_vstelm_w(res2, dst, 0, 0);
  __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
  __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
  __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
}

static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (height == 4) {
    common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
  } else if (height == 8) {
    common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
  }
}

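/* 8-wide bilinear: each row fits one vector, so the pixel pairs are
 * gathered in-register with __lsx_vshuf_b against the row itself. */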
static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  __m128i filt0, mask;
  __m128i src0, src1, src2, src3;
  __m128i vec0, vec1, vec2, vec3;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
  src3 = __lsx_vldx(src, src_stride3);

  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
            src3, src3, mask, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
            vec0, vec1, vec2, vec3);
  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
            FILTER_BITS, vec0, vec1);

  __lsx_vstelm_d(vec0, dst, 0, 0);
  __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
  __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
  __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
}

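/* Bilinear 8-wide for heights 8 and 16: the first eight rows are emitted
 * unconditionally, the second eight only when height == 16. */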
static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  __m128i filt0, mask;
  __m128i src0, src1, src2, src3, out0, out1;
  __m128i vec0, vec1, vec2, vec3;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
  src3 = __lsx_vldx(src, src_stride3);
  src += src_stride4;

  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
            src3, src3, mask, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
            vec0, vec1, vec2, vec3);
  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
            FILTER_BITS, out0, out1);

  __lsx_vstelm_d(out0, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out0, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 1);
  dst += dst_stride;

  src0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
  src3 = __lsx_vldx(src, src_stride3);
  src += src_stride4;

  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
            src3, src3, mask, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
            vec0, vec1, vec2, vec3);
  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
            FILTER_BITS, out0, out1);

  __lsx_vstelm_d(out0, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out0, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_d(out1, dst, 0, 1);
  dst += dst_stride;

  if (height == 16) {
    uint8_t *dst_tmp1 = dst + dst_stride4;

    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
    src3 = __lsx_vldx(src, src_stride3);
    src += src_stride4;

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, vec0, vec1, vec2, vec3);
    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
              FILTER_BITS, out0, out1);

    __lsx_vstelm_d(out0, dst, 0, 0);
    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);

    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
    src3 = __lsx_vldx(src, src_stride3);
    src += src_stride4;

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, vec0, vec1, vec2, vec3);
    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
              FILTER_BITS, out0, out1);

    __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
    __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
  }
}

static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (height == 4) {
    common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
  }
}

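/* 16-wide bilinear: the first four rows are peeled off before the loop,
 * hence loop_cnt = (height >> 2) - 1; src_tmp1 tracks the upper 8-byte
 * half of each row. */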
static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt = (height >> 2) - 1;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  uint8_t *src_tmp1 = (uint8_t *)src + 8;

  mask = __lsx_vld(mc_filt_mask_arr, 0);
  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
  src6 = __lsx_vldx(src, src_stride3);
  src1 = __lsx_vld(src_tmp1, 0);
  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
            src5);
  src7 = __lsx_vldx(src_tmp1, src_stride3);
  src += src_stride4;

  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
            src3, src3, mask, vec0, vec1, vec2, vec3);
  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
            src7, src7, mask, vec4, vec5, vec6, vec7);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
            out0, out1, out2, out3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
            out4, out5, out6, out7);
  DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
            FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
            out1, out2, out3);

  __lsx_vst(out0, dst, 0);
  dst += dst_stride;
  __lsx_vst(out1, dst, 0);
  dst += dst_stride;
  __lsx_vst(out2, dst, 0);
  dst += dst_stride;
  __lsx_vst(out3, dst, 0);
  dst += dst_stride;

  for (; loop_cnt--;) {
    src_tmp1 += src_stride4;

    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
    src6 = __lsx_vldx(src, src_stride3);

    src1 = __lsx_vld(src_tmp1, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
              src5);
    src7 = __lsx_vldx(src_tmp1, src_stride3);
    src += src_stride4;

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, out0, out1, out2, out3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
              filt0, out4, out5, out6, out7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
              out0, out1, out2, out3);

    __lsx_vst(out0, dst, 0);
    dst += dst_stride;
    __lsx_vst(out1, dst, 0);
    dst += dst_stride;
    __lsx_vst(out2, dst, 0);
    dst += dst_stride;
    __lsx_vst(out3, dst, 0);
    dst += dst_stride;
  }
}

static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt = (height >> 1);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  mask = __lsx_vld(mc_filt_mask_arr, 0);
  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
    src3 = __lsx_vld(src, 24);
    src1 = __lsx_vshuf_b(src2, src0, shuff);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
    src7 = __lsx_vld(src, 24);
    src5 = __lsx_vshuf_b(src6, src4, shuff);
    src += src_stride;

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, out0, out1, out2, out3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
              filt0, out4, out5, out6, out7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
              out0, out1, out2, out3);

    __lsx_vst(out0, dst, 0);
    __lsx_vst(out1, dst, 16);
    dst += dst_stride;

    __lsx_vst(out2, dst, 0);
    __lsx_vst(out3, dst, 16);
    dst += dst_stride;
  }
}

static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt = height;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  filt0 = __lsx_vldrepl_h(filter, 0);

  for (; loop_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
              src6);
    src7 = __lsx_vld(src, 56);
    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
    src5 = __lsx_vshuf_b(src6, src4, shuff);
    src += src_stride;

    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
              mask, src7, src7, mask, vec4, vec5, vec6, vec7);

    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, out0, out1, out2, out3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
              filt0, out4, out5, out6, out7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
              out0, out1, out2, out3);

    __lsx_vst(out0, dst, 0);
    __lsx_vst(out1, dst, 16);
    __lsx_vst(out2, dst, 32);
    __lsx_vst(out3, dst, 48);
    dst += dst_stride;
  }
}

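/* Horizontal 8-tap/bilinear convolution entry point. Dispatches on the
 * number of filter taps and the block width; unsupported widths fall back
 * to the C implementation. */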
void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  /* Only the unscaled case is handled here; the all-pass (copy) filter is
   * expected to be routed to vpx_convolve_copy instead. */
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* Narrow the 16-bit filter taps to 8 bits for the LSX kernels. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2) {
    /* Bilinear: only the two middle taps are nonzero, so pass the tap pair
     * starting at filt_hor[3]. */
    switch (w) {
      case 4:
        common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}