/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

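/* 8-tap horizontal filter of a 4x4 input block; the rounded, saturated result
   is averaged with the data already in dst before being stored. */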
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0, dst1, dst2, dst3, res2, res3;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, res0, res1);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  XORI_B2_128_UB(res2, res3);
  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

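/* 8-tap horizontal filter of a 4x8 block, averaged with dst. */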
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 filt, vec0, vec1, vec2, vec3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec0, vec1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec2, vec3);
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
              res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}

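/* Dispatch 4-wide blocks to the 4x4 or 4x8 variant based on height. */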
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

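/* 8-tap horizontal filter of an 8-wide block, 4 rows per loop iteration,
   averaged with dst. */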
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);
  }
}

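/* 8-tap horizontal filter of a 16-wide block, 2 rows per loop iteration,
   averaged with dst. */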
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}

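/* 8-tap horizontal filter of a 32-wide block, one row per loop iteration,
   averaged with dst. */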
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}

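/* 8-tap horizontal filter of a 64-wide block, processed as two 32-byte halves
   per row, averaged with dst. */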
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    for (cnt = 0; cnt < 2; ++cnt) {
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                   vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                  out2, out3);
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}

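/* 2-tap (bilinear) counterparts of the functions above: a single pair of
   filter taps is applied horizontally and the result is averaged with dst. */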
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
}

static void common_hz_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
  }
}

static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
  }
}

static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
              res2, res3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
              res6, res7);
  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
  dst += dst_stride;

  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}

static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}

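/* vpx_convolve8_avg_horiz entry point for MSA. Copies the horizontal filter
   taps to int8_t, then dispatches on block width: the 2-tap path is taken
   when the first two taps of filter_x are zero (bilinear filter), the 8-tap
   path otherwise, and unsupported widths fall back to the C version. */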
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}