/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

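/* 8-tap horizontal filter for a 4x4 block. src is rewound by 3 columns so
 * the mask-driven byte shuffles cover the full 8-tap window; results are
 * rounded by FILTER_BITS, saturated and packed back to unsigned 8 bits. */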
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

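/* 8-tap horizontal filter for a 4x8 block: two four-row batches of the 4x4
 * path, sharing a single filter load and mask setup. */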
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

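/* dispatcher for width-4 8-tap filtering; heights other than 4 and 8
 * produce no output here. */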
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

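/* 8-tap horizontal filter for an 8x4 block, four rows in one pass. */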
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

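/* 8-tap horizontal filter for 8-wide blocks, four rows per loop iteration;
 * height is assumed to be a multiple of four. */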
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

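/* dispatcher for width-8 8-tap filtering: height 4 takes the unrolled
 * path, taller blocks the four-rows-per-iteration loop. */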
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

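/* 8-tap horizontal filter for 16-wide blocks: two rows per iteration, each
 * row loaded as two overlapping 16-byte vectors (offsets 0 and 8) so the
 * 8-wide filter kernel covers all 16 output pixels. */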
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}

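/* 8-tap horizontal filter for 32-wide blocks, two rows per iteration.
 * Loads at offsets 0, 16 and 24 plus an __msa_sldi_b shift build the byte
 * 8..23 vector from data already in registers, saving a fourth load. */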
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}

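/* 8-tap horizontal filter for 64-wide blocks: one row per iteration,
 * processed as two 32-byte halves with the same shifted-load scheme as the
 * 32-wide path. */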
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}

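/* bilinear (2-tap) horizontal filter for a 4x4 block: a single splatted
 * tap pair, unsigned dot products and rounding by FILTER_BITS; no source
 * rewind is needed since the 2-tap window starts at the output column. */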
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

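/* bilinear horizontal filter for a 4x8 block, all eight rows in one pass. */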
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

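/* dispatcher for width-4 bilinear filtering; heights other than 4 and 8
 * produce no output here. */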
static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

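/* bilinear horizontal filter for an 8x4 block. */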
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

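/* bilinear horizontal filter for 8-wide blocks of height 8 or 16, fully
 * unrolled in four-row groups with the next group's loads hoisted ahead of
 * the current group's stores. */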
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

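/* dispatcher for width-8 bilinear filtering: height 4 takes the 8x4 path,
 * taller blocks the unrolled height-8/16 path. */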
static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

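/* bilinear horizontal filter for 16-wide blocks: the first four rows are
 * peeled off before the loop, which then handles the remaining
 * (height / 4 - 1) four-row groups. */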
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

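/* bilinear horizontal filter for 32-wide blocks, two rows per iteration,
 * with the byte 8..23 vector built by __msa_sldi_b as in the 8-tap path. */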
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}

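/* bilinear horizontal filter for 64-wide blocks, one row per iteration;
 * SLDI_B3_SB builds the three shifted windows (bytes 8..23, 24..39 and
 * 40..55) in a single macro from the four aligned loads. */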
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}

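/* entry point for horizontal subpel filtering. x_step_q4 must be 16, i.e.
 * no horizontal scaling. filter_x is read as packed int32 tap pairs: a
 * zero first pair (taps 0 and 1) identifies the bilinear filters in the
 * vpx subpel filter banks, whose only nonzero taps are filt_hor[3] and
 * filt_hor[4]. The assert appears to reject the identity filter
 * (tap 3 == 128, tap 2 == 0), which callers are expected to route to
 * vpx_convolve_copy instead. */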
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}