1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vp8/common/filter.h"
14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
15
16 DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = {
17 { 0, -6, 123, 12, -1, 0, 0, 0 },
18 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
19 { 0, -9, 93, 50, -6, 0, 0, 0 },
20 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
21 { 0, -6, 50, 93, -9, 0, 0, 0 },
22 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
23 { 0, -1, 12, 123, -6, 0, 0, 0 },
24 };
25
/* Byte-shuffle index patterns used by the horizontal filters: three groups
 * of 16 indices.  Every group lists neighbouring byte pairs (i, i + 1); the
 * filter routines derive the patterns for the later tap pairs by adding 2
 * and 4 to a loaded group.
 *   [0..15]   pairs (0,1) .. (7,8), loaded by the 8- and 16-wide routines.
 *   [16..31]  pairs (0,1) .. (3,4) followed by (16,17) .. (19,20), loaded
 *             by the 4-wide routines, which shuffle two rows at once.
 *   [32..47]  as the previous group but starting at bytes 8 and 24; not
 *             referenced in the part of the file visible here. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4,
  4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4,
  16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12,
  24, 25, 25, 26, 26, 27, 27, 28
};
34
/* Applies a 6-tap horizontal kernel to the bytes selected from (src0, src1)
 * by the three shuffle masks, one mask per kernel operand filt_h0..filt_h2.
 * Evaluates to a v8i16 holding the sums after rounding by VP8_FILTER_SHIFT
 * and saturation with __msa_sat_s_h(..., 7). */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                        filt_h2)                                           \
  ({                                                                       \
    v16i8 pair01_m, pair23_m, pair45_m;                                    \
    v8i16 sum6_m;                                                          \
                                                                           \
    /* one shuffle of the same two sources per kernel operand */           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               pair01_m, pair23_m, pair45_m);                              \
    sum6_m = DPADD_SH3_SH(pair01_m, pair23_m, pair45_m, filt_h0, filt_h1,  \
                          filt_h2);                                        \
                                                                           \
    /* round first, then saturate */                                       \
    sum6_m = __msa_sat_s_h(__msa_srari_h(sum6_m, VP8_FILTER_SHIFT), 7);    \
                                                                           \
    sum6_m;                                                                \
  })
51
/* 6-tap horizontal filter for four source vectors in the 4-wide layout:
 * (src0, src1) are shuffled together into out0 and (src2, src3) into out1.
 * The outputs are raw accumulated sums; rounding and saturation are left to
 * the caller. */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1) \
  {                                                                        \
    v16i8 rows01_a_m, rows23_a_m;                                          \
    v16i8 rows01_b_m, rows23_b_m;                                          \
    v16i8 rows01_c_m, rows23_c_m;                                          \
                                                                           \
    /* gather the bytes for each of the three kernel operands */           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, rows01_a_m,           \
               rows23_a_m);                                                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, rows01_b_m,           \
               rows23_b_m);                                                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, rows01_c_m,           \
               rows23_c_m);                                                \
                                                                           \
    /* first product initialises the sums, the other two accumulate */     \
    DOTP_SB2_SH(rows01_a_m, rows23_a_m, filt0, filt0, out0, out1);         \
    DPADD_SB2_SH(rows01_b_m, rows23_b_m, filt1, filt1, out0, out1);        \
    DPADD_SB2_SH(rows01_c_m, rows23_c_m, filt2, filt2, out0, out1);        \
  }
64
/* 6-tap horizontal filter for four source vectors in the 8-wide layout:
 * each source is shuffled against itself and produces its own output
 * (src0 -> out0 ... src3 -> out3).  The outputs are raw accumulated sums;
 * rounding and saturation are left to the caller. */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1, \
                                   out2, out3)                             \
  {                                                                        \
    v16i8 sel0_m, sel1_m, sel2_m, sel3_m;                                  \
                                                                           \
    /* first kernel operand: initialise the sums */                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, sel0_m, sel1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, sel2_m, sel3_m);      \
    DOTP_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt0, filt0, filt0,       \
                filt0, out0, out1, out2, out3);                            \
                                                                           \
    /* second kernel operand: accumulate */                                \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, sel0_m, sel1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, sel2_m, sel3_m);      \
    DPADD_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt1, filt1, filt1,      \
                 filt1, out0, out1, out2, out3);                           \
                                                                           \
    /* third kernel operand: accumulate */                                 \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, sel0_m, sel1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, sel2_m, sel3_m);      \
    DPADD_SB4_SH(sel0_m, sel1_m, sel2_m, sel3_m, filt2, filt2, filt2,      \
                 filt2, out0, out1, out2, out3);                           \
  }
84
/* 4-tap multiply-accumulate: evaluates to a v8i16 holding
 * dotp(vec0, filt0) + dotp(vec1, filt1).  No rounding or saturation is
 * applied here.
 *
 * The temporary carries the "_m" suffix used by the other macros in this
 * file so it cannot capture a caller variable (functions here declare
 * locals named tmp0), and every argument is parenthesised before the cast
 * so that expression arguments are converted as a whole. */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                    \
  ({                                                                     \
    v8i16 sum4_m;                                                        \
                                                                         \
    sum4_m = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0));              \
    sum4_m = __msa_dpadd_s_h(sum4_m, (v16i8)(vec1), (v16i8)(filt1));     \
                                                                         \
    sum4_m;                                                              \
  })
94
/* Applies a 4-tap horizontal kernel to the bytes selected from (src0, src1)
 * by the two shuffle masks.  Evaluates to a v8i16 holding the sums after
 * rounding by VP8_FILTER_SHIFT and saturation with __msa_sat_s_h(..., 7). */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)          \
  ({                                                                         \
    v16i8 pair01_m, pair23_m;                                                \
    v8i16 sum4_hz_m;                                                         \
                                                                             \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, pair01_m, pair23_m);    \
    sum4_hz_m = FILT_4TAP_DPADD_S_H(pair01_m, pair23_m, filt_h0, filt_h1);   \
                                                                             \
    /* round first, then saturate */                                         \
    sum4_hz_m =                                                              \
        __msa_sat_s_h(__msa_srari_h(sum4_hz_m, VP8_FILTER_SHIFT), 7);        \
                                                                             \
    sum4_hz_m;                                                               \
  })
108
/* 4-tap horizontal filter for four source vectors in the 4-wide layout:
 * (src0, src1) are shuffled together into out0 and (src2, src3) into out1.
 * The outputs are raw accumulated sums; rounding and saturation are left to
 * the caller. */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,  \
                                   filt0, filt1, out0, out1)              \
  {                                                                       \
    v16i8 rows01_a_m, rows23_a_m;                                         \
    v16i8 rows01_b_m, rows23_b_m;                                         \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, rows01_a_m,          \
               rows23_a_m);                                               \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, rows01_b_m,          \
               rows23_b_m);                                               \
                                                                          \
    /* first product initialises the sums, the second accumulates */      \
    DOTP_SB2_SH(rows01_a_m, rows23_a_m, filt0, filt0, out0, out1);        \
    DPADD_SB2_SH(rows01_b_m, rows23_b_m, filt1, filt1, out0, out1);       \
  }
119
/* 4-tap horizontal filter for four source vectors in the 8-wide layout:
 * each source is shuffled against itself and produces its own output
 * (src0 -> out0 ... src3 -> out3).  The outputs are raw accumulated sums;
 * rounding and saturation are left to the caller. */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,  \
                                   filt0, filt1, out0, out1, out2, out3)  \
  {                                                                       \
    v16i8 first0_m, first1_m, first2_m, first3_m;                         \
    v16i8 second0_m, second1_m, second2_m, second3_m;                     \
                                                                          \
    /* bytes for the first kernel operand */                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, first0_m, first1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, first2_m, first3_m); \
    /* bytes for the second kernel operand */                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, second0_m,           \
               second1_m);                                                \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, second2_m,           \
               second3_m);                                                \
                                                                          \
    DOTP_SB4_SH(first0_m, first1_m, first2_m, first3_m, filt0, filt0,     \
                filt0, filt0, out0, out1, out2, out3);                    \
    DPADD_SB4_SH(second0_m, second1_m, second2_m, second3_m, filt1,       \
                 filt1, filt1, filt1, out0, out1, out2, out3);            \
  }
134
/* Horizontal 6-tap interpolation of a 4x4 block.
 *
 * src, src_stride  source pixels; each of the four rows is read starting two
 *                  bytes to the left of src.
 * dst, dst_stride  destination; four rows of four bytes are written.
 * filter           kernel taps, fetched with one vector load. */
static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, coef45;
  v16u8 sel01, sel23, sel45, pixels;
  v8i16 coefs, rows01, rows23;

  /* Broadcast the first three 16-bit lanes of the kernel. */
  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  /* 4-wide shuffle pattern, stepped by two bytes per later kernel lane. */
  sel01 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  sel23 = sel01 + 2;
  sel45 = sel01 + 4;

  src -= 2;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, sel45,
                             coef01, coef23, coef45, rows01, rows23);
  SRARI_H2_SH(rows01, rows23, VP8_FILTER_SHIFT);
  SAT_SH2_SH(rows01, rows23, 7);

  pixels = PCKEV_XORI128_UB(rows01, rows23);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
160
/* Horizontal 6-tap interpolation of a 4-wide, 8-high block, handled as two
 * groups of four rows.
 *
 * src, src_stride  source pixels; each row is read starting two bytes to the
 *                  left of src.
 * dst, dst_stride  destination; eight rows of four bytes are written.
 * filter           kernel taps, fetched with one vector load. */
static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, coef45;
  v16u8 sel01, sel23, sel45, pixels;
  v8i16 coefs, rows01, rows23, rows45, rows67;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  sel01 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  sel23 = sel01 + 2;
  sel45 = sel01 + 4;

  /* Rows 0..3. */
  src -= 2;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, sel45,
                             coef01, coef23, coef45, rows01, rows23);

  /* Rows 4..7, reusing the same input registers. */
  src += (4 * src_stride);
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, sel45,
                             coef01, coef23, coef45, rows45, rows67);

  SRARI_H4_SH(rows01, rows23, rows45, rows67, VP8_FILTER_SHIFT);
  SAT_SH4_SH(rows01, rows23, rows45, rows67, 7);

  pixels = PCKEV_XORI128_UB(rows01, rows23);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  pixels = PCKEV_XORI128_UB(rows45, rows67);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
194
/* Horizontal 6-tap interpolation of a 4-wide block: forwards to the 4x4 or
 * 4x8 routine according to height.  Any other height writes nothing. */
static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default: break;
  }
}
204
/* Horizontal 6-tap interpolation of an 8-wide block, four rows per pass.
 *
 * src, src_stride  source pixels; each row is read starting two bytes to the
 *                  left of src.
 * dst, dst_stride  destination; eight bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; only complete groups of four are
 *                  processed (height >> 2 passes).
 *
 * All passes run inside one loop.  The earlier form peeled the first pass
 * and started the unsigned counter at (height >> 2) - 1, which wraps around
 * for height < 4; such a height now simply produces no output, as in the
 * other looped routines of this file. */
static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
  v16u8 mask0, mask1, mask2, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  src -= 2;

  /* Broadcast the first three 16-bit lanes of the kernel. */
  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);

    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
248
/* Horizontal 6-tap interpolation of a 16-wide block, four rows per pass.
 * Every row is loaded as two vectors, one at src and one at src + 8, which
 * are filtered separately and packed into one 16-byte output row.
 *
 * src, src_stride  source pixels; each row is read starting two bytes to the
 *                  left of src.
 * dst, dst_stride  destination; sixteen bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b;
  v16i8 coef01, coef23, coef45;
  v16u8 sel01, sel23, sel45;
  v16u8 px0, px1, px2, px3;
  v8i16 coefs;
  v8i16 s0a, s0b, s1a, s1b, s2a, s2b, s3a, s3b;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  sel01 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  sel23 = sel01 + 2;
  sel45 = sel01 + 4;

  src -= 2;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* "a" vectors start at src, "b" vectors eight bytes further right. */
    LD_SB4(src, src_stride, r0a, r1a, r2a, r3a);
    LD_SB4(src + 8, src_stride, r0b, r1b, r2b, r3b);
    XORI_B8_128_SB(r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b);
    src += (4 * src_stride);

    HORIZ_6TAP_8WID_4VECS_FILT(r0a, r0b, r1a, r1b, sel01, sel23, sel45,
                               coef01, coef23, coef45, s0a, s0b, s1a, s1b);
    HORIZ_6TAP_8WID_4VECS_FILT(r2a, r2b, r3a, r3b, sel01, sel23, sel45,
                               coef01, coef23, coef45, s2a, s2b, s3a, s3b);
    SRARI_H4_SH(s0a, s0b, s1a, s1b, VP8_FILTER_SHIFT);
    SRARI_H4_SH(s2a, s2b, s3a, s3b, VP8_FILTER_SHIFT);
    SAT_SH4_SH(s0a, s0b, s1a, s1b, 7);
    SAT_SH4_SH(s2a, s2b, s3a, s3b, 7);

    px0 = PCKEV_XORI128_UB(s0a, s0b);
    px1 = PCKEV_XORI128_UB(s1a, s1b);
    px2 = PCKEV_XORI128_UB(s2a, s2b);
    px3 = PCKEV_XORI128_UB(s3a, s3b);
    ST_UB4(px0, px1, px2, px3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
294
/* Vertical 6-tap interpolation of a 4-wide block, four rows per pass.
 *
 * src, src_stride  source pixels; reading starts two rows above src and
 *                  five rows are loaded before the first pass.
 * dst, dst_stride  destination; four bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 il10, il21, il32, il43, il54, il65, il76, il87;
  v16i8 pk2110, pk4332, pk6554, pk8776;
  v16i8 coef01, coef23, coef45;
  v16u8 pixels;
  v8i16 coefs, sum10, sum32;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  /* Interleave neighbouring rows, then pack two interleaved pairs into one
   * vector before applying the 128 bias. */
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, il10, il21, il32, il43);
  ILVR_D2_SB(il21, il10, il43, il32, pk2110, pk4332);
  XORI_B2_128_SB(pk2110, pk4332);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);

    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, il54, il65, il76, il87);
    ILVR_D2_SB(il65, il54, il87, il76, pk6554, pk8776);
    XORI_B2_128_SB(pk6554, pk8776);

    sum10 = DPADD_SH3_SH(pk2110, pk4332, pk6554, coef01, coef23, coef45);
    sum32 = DPADD_SH3_SH(pk4332, pk6554, pk8776, coef01, coef23, coef45);
    SRARI_H2_SH(sum10, sum32, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum10, sum32, 7);

    pixels = PCKEV_XORI128_UB(sum10, sum32);
    ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the row window down by four rows. */
    pk2110 = pk6554;
    pk4332 = pk8776;
    r4 = r8;
  }
}
339
/* Vertical 6-tap interpolation of an 8-wide block, four rows per pass.
 *
 * src, src_stride  source pixels; reading starts two rows above src and
 *                  five rows are loaded before the first pass.
 * dst, dst_stride  destination; eight bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 il10, il21, il32, il43, il54, il65, il76, il87;
  v16i8 coef01, coef23, coef45;
  v16u8 px01, px23;
  v8i16 coefs, sum0, sum1, sum2, sum3;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  XORI_B5_128_SB(r0, r1, r2, r3, r4);
  /* ilAB holds row A interleaved with row B. */
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, il10, il21, il32, il43);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    XORI_B4_128_SB(r5, r6, r7, r8);
    src += (4 * src_stride);

    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, il54, il65, il76, il87);

    sum0 = DPADD_SH3_SH(il10, il32, il54, coef01, coef23, coef45);
    sum1 = DPADD_SH3_SH(il21, il43, il65, coef01, coef23, coef45);
    sum2 = DPADD_SH3_SH(il32, il54, il76, coef01, coef23, coef45);
    sum3 = DPADD_SH3_SH(il43, il65, il87, coef01, coef23, coef45);
    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);

    px01 = PCKEV_XORI128_UB(sum0, sum1);
    px23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(px01, px23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the row window down by four rows. */
    il10 = il54;
    il21 = il65;
    il32 = il76;
    il43 = il87;
    r4 = r8;
  }
}
387
/* Vertical 6-tap interpolation of a 16-wide block, four rows per pass.
 * Neighbouring rows are interleaved with both ILVR and ILVL so that all
 * sixteen columns are covered; the "lo" names hold the ILVR results and the
 * "hi" names the ILVL results.
 *
 * src, src_stride  source pixels; reading starts two rows above src and
 *                  five rows are loaded before the first pass.
 * dst, dst_stride  destination; sixteen bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 lo10, lo21, lo32, lo43, lo54, lo65, lo76, lo87;
  v16i8 hi10, hi21, hi32, hi43, hi54, hi65, hi76, hi87;
  v16i8 coef01, coef23, coef45;
  v16u8 px0, px1, px2, px3;
  v8i16 coefs;
  v8i16 sum0_lo, sum1_lo, sum2_lo, sum3_lo;
  v8i16 sum0_hi, sum1_hi, sum2_hi, sum3_hi;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef01, coef23, coef45);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  XORI_B5_128_SB(r0, r1, r2, r3, r4);
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, lo10, lo21, lo32, lo43);
  ILVL_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, hi10, hi21, hi32, hi43);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);

    XORI_B4_128_SB(r5, r6, r7, r8);
    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, lo54, lo65, lo76, lo87);
    ILVL_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, hi54, hi65, hi76, hi87);

    sum0_lo = DPADD_SH3_SH(lo10, lo32, lo54, coef01, coef23, coef45);
    sum1_lo = DPADD_SH3_SH(lo21, lo43, lo65, coef01, coef23, coef45);
    sum2_lo = DPADD_SH3_SH(lo32, lo54, lo76, coef01, coef23, coef45);
    sum3_lo = DPADD_SH3_SH(lo43, lo65, lo87, coef01, coef23, coef45);
    sum0_hi = DPADD_SH3_SH(hi10, hi32, hi54, coef01, coef23, coef45);
    sum1_hi = DPADD_SH3_SH(hi21, hi43, hi65, coef01, coef23, coef45);
    sum2_hi = DPADD_SH3_SH(hi32, hi54, hi76, coef01, coef23, coef45);
    sum3_hi = DPADD_SH3_SH(hi43, hi65, hi87, coef01, coef23, coef45);

    SRARI_H4_SH(sum0_lo, sum1_lo, sum2_lo, sum3_lo, VP8_FILTER_SHIFT);
    SRARI_H4_SH(sum0_hi, sum1_hi, sum2_hi, sum3_hi, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0_lo, sum1_lo, sum2_lo, sum3_lo, 7);
    SAT_SH4_SH(sum0_hi, sum1_hi, sum2_hi, sum3_hi, 7);

    PCKEV_B4_UB(sum0_hi, sum0_lo, sum1_hi, sum1_lo, sum2_hi, sum2_lo,
                sum3_hi, sum3_lo, px0, px1, px2, px3);
    XORI_B4_128_UB(px0, px1, px2, px3);
    ST_UB4(px0, px1, px2, px3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the row window down by four rows. */
    lo10 = lo54;
    lo21 = lo65;
    lo32 = lo76;
    lo43 = lo87;
    hi10 = hi54;
    hi21 = hi65;
    hi32 = hi76;
    hi43 = hi87;
    r4 = r8;
  }
}
451
/* Two-dimensional 6-tap interpolation of a 4-wide block: a horizontal pass
 * (HORIZ_6TAP_FILT, two rows per call) followed by a vertical pass over the
 * horizontal results.  Four output rows are produced per loop pass.
 *
 * src, src_stride  source pixels; reading starts two rows above and two
 *                  bytes to the left of src, and five rows are loaded before
 *                  the first pass.
 * dst, dst_stride  destination; four bytes are written per row.
 * filter_horiz     taps of the horizontal kernel.
 * filter_vert      taps of the vertical kernel.
 * height           number of rows; height >> 2 passes are made.
 *
 * The vertical rounding uses VP8_FILTER_SHIFT, like the horizontal pass and
 * every other routine in this file, instead of a bare literal.
 * NOTE(review): this assumes VP8_FILTER_SHIFT is 7, the value previously
 * hard-coded here -- confirm against vp8/common/filter.h. */
static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 hcoef01, hcoef23, hcoef45;
  v16u8 sel01, sel23, sel45, pixels;
  v8i16 coefs, vcoef01, vcoef23, vcoef45;
  v8i16 hz01, hz12, hz23, hz34, hz45, hz56, hz67, hz78;
  v8i16 il_a, il_b, il_c, il_d;
  v8i16 sum_top, sum_bot;

  sel01 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  sel23 = sel01 + 2;
  sel45 = sel01 + 4;

  coefs = LD_SH(filter_horiz);
  SPLATI_H3_SB(coefs, 0, 1, 2, hcoef01, hcoef23, hcoef45);
  coefs = LD_SH(filter_vert);
  SPLATI_H3_SH(coefs, 0, 1, 2, vcoef01, vcoef23, vcoef45);

  src -= (2 + 2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);
  XORI_B5_128_SB(r0, r1, r2, r3, r4);

  /* hzAB is the horizontal result for source vectors A and B; the odd
   * combinations are spliced from their neighbours with sldi. */
  hz01 = HORIZ_6TAP_FILT(r0, r1, sel01, sel23, sel45, hcoef01, hcoef23,
                         hcoef45);
  hz23 = HORIZ_6TAP_FILT(r2, r3, sel01, sel23, sel45, hcoef01, hcoef23,
                         hcoef45);
  hz12 = (v8i16)__msa_sldi_b((v16i8)hz23, (v16i8)hz01, 8);
  hz34 = HORIZ_6TAP_FILT(r3, r4, sel01, sel23, sel45, hcoef01, hcoef23,
                         hcoef45);
  ILVEV_B2_SH(hz01, hz12, hz23, hz34, il_a, il_b);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);
    XORI_B4_128_SB(r5, r6, r7, r8);

    hz56 = HORIZ_6TAP_FILT(r5, r6, sel01, sel23, sel45, hcoef01, hcoef23,
                           hcoef45);
    hz45 = (v8i16)__msa_sldi_b((v16i8)hz56, (v16i8)hz34, 8);
    hz78 = HORIZ_6TAP_FILT(r7, r8, sel01, sel23, sel45, hcoef01, hcoef23,
                           hcoef45);
    hz67 = (v8i16)__msa_sldi_b((v16i8)hz78, (v16i8)hz56, 8);

    il_c = (v8i16)__msa_ilvev_b((v16i8)hz56, (v16i8)hz45);
    sum_top = DPADD_SH3_SH(il_a, il_b, il_c, vcoef01, vcoef23, vcoef45);

    il_d = (v8i16)__msa_ilvev_b((v16i8)hz78, (v16i8)hz67);
    sum_bot = DPADD_SH3_SH(il_b, il_c, il_d, vcoef01, vcoef23, vcoef45);

    SRARI_H2_SH(sum_top, sum_bot, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    pixels = PCKEV_XORI128_UB(sum_top, sum_bot);
    ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal results into the next pass. */
    hz34 = hz78;
    il_a = il_c;
    il_b = il_d;
  }
}
523
/* Two-dimensional 6-tap interpolation of an 8-wide block: every source row
 * is filtered horizontally with HORIZ_6TAP_FILT, then neighbouring
 * horizontal results are interleaved and filtered vertically.  Four output
 * rows are produced per loop pass.
 *
 * src, src_stride  source pixels; reading starts two rows above and two
 *                  bytes to the left of src, and five rows are loaded before
 *                  the first pass.
 * dst, dst_stride  destination; eight bytes are written per row.
 * filter_horiz     taps of the horizontal kernel.
 * filter_vert      taps of the vertical kernel.
 * height           number of rows; height >> 2 passes are made.
 *
 * The vertical rounding uses VP8_FILTER_SHIFT, like the horizontal pass and
 * every other routine in this file, instead of a bare literal.
 * NOTE(review): this assumes VP8_FILTER_SHIFT is 7, the value previously
 * hard-coded here -- confirm against vp8/common/filter.h. */
static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 hcoef01, hcoef23, hcoef45;
  v16u8 sel01, sel23, sel45, px01, px23;
  v8i16 coefs, vcoef01, vcoef23, vcoef45;
  v8i16 hz0, hz1, hz2, hz3, hz4, hz5, hz6, hz7, hz8;
  v8i16 p01, p12, p23, p34, p45, p56, p67, p78;
  v8i16 sum0, sum1, sum2, sum3;

  sel01 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  sel23 = sel01 + 2;
  sel45 = sel01 + 4;

  coefs = LD_SH(filter_horiz);
  SPLATI_H3_SB(coefs, 0, 1, 2, hcoef01, hcoef23, hcoef45);

  src -= (2 + 2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);
  XORI_B5_128_SB(r0, r1, r2, r3, r4);

  /* hzN is the horizontal result for source row N. */
  hz0 = HORIZ_6TAP_FILT(r0, r0, sel01, sel23, sel45, hcoef01, hcoef23,
                        hcoef45);
  hz1 = HORIZ_6TAP_FILT(r1, r1, sel01, sel23, sel45, hcoef01, hcoef23,
                        hcoef45);
  hz2 = HORIZ_6TAP_FILT(r2, r2, sel01, sel23, sel45, hcoef01, hcoef23,
                        hcoef45);
  hz3 = HORIZ_6TAP_FILT(r3, r3, sel01, sel23, sel45, hcoef01, hcoef23,
                        hcoef45);
  hz4 = HORIZ_6TAP_FILT(r4, r4, sel01, sel23, sel45, hcoef01, hcoef23,
                        hcoef45);

  coefs = LD_SH(filter_vert);
  SPLATI_H3_SH(coefs, 0, 1, 2, vcoef01, vcoef23, vcoef45);

  /* pAB interleaves the horizontal results of rows A and B. */
  ILVEV_B2_SH(hz0, hz1, hz2, hz3, p01, p23);
  ILVEV_B2_SH(hz1, hz2, hz3, hz4, p12, p34);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);
    XORI_B4_128_SB(r5, r6, r7, r8);

    hz5 = HORIZ_6TAP_FILT(r5, r5, sel01, sel23, sel45, hcoef01, hcoef23,
                          hcoef45);
    p45 = (v8i16)__msa_ilvev_b((v16i8)hz5, (v16i8)hz4);
    sum0 = DPADD_SH3_SH(p01, p23, p45, vcoef01, vcoef23, vcoef45);

    hz6 = HORIZ_6TAP_FILT(r6, r6, sel01, sel23, sel45, hcoef01, hcoef23,
                          hcoef45);
    p56 = (v8i16)__msa_ilvev_b((v16i8)hz6, (v16i8)hz5);
    sum1 = DPADD_SH3_SH(p12, p34, p56, vcoef01, vcoef23, vcoef45);

    hz7 = HORIZ_6TAP_FILT(r7, r7, sel01, sel23, sel45, hcoef01, hcoef23,
                          hcoef45);
    p67 = (v8i16)__msa_ilvev_b((v16i8)hz7, (v16i8)hz6);
    sum2 = DPADD_SH3_SH(p23, p45, p67, vcoef01, vcoef23, vcoef45);

    hz8 = HORIZ_6TAP_FILT(r8, r8, sel01, sel23, sel45, hcoef01, hcoef23,
                          hcoef45);
    p78 = (v8i16)__msa_ilvev_b((v16i8)hz8, (v16i8)hz7);
    sum3 = DPADD_SH3_SH(p34, p56, p78, vcoef01, vcoef23, vcoef45);

    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    px01 = PCKEV_XORI128_UB(sum0, sum1);
    px23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(px01, px23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the row window down by four rows. */
    hz4 = hz8;
    p01 = p45;
    p23 = p67;
    p12 = p56;
    p34 = p78;
  }
}
607
/* Two-dimensional 6-tap interpolation of a 16-wide block, done as two
 * independent 8-wide columns: one at src/dst and one eight bytes to the
 * right.  All other arguments are passed through unchanged. */
static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
621
/* Horizontal 4-tap interpolation of a 4x4 block.
 *
 * src, src_stride  source pixels; each of the four rows is read starting one
 *                  byte to the left of src.
 * dst, dst_stride  destination; four rows of four bytes are written.
 * filter           kernel taps, fetched with one vector load. */
static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, sel01, sel23;
  v16u8 pixels;
  v8i16 coefs, rows01, rows23;

  /* Broadcast the first two 16-bit lanes of the kernel. */
  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef01, coef23);

  sel01 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  sel23 = sel01 + 2;

  src -= 1;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, coef01,
                             coef23, rows01, rows23);
  SRARI_H2_SH(rows01, rows23, VP8_FILTER_SHIFT);
  SAT_SH2_SH(rows01, rows23, 7);

  pixels = PCKEV_XORI128_UB(rows01, rows23);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
646
/* Horizontal 4-tap interpolation of a 4-wide, 8-high block, handled as two
 * groups of four rows.
 *
 * src, src_stride  source pixels; each row is read starting one byte to the
 *                  left of src.
 * dst, dst_stride  destination; eight rows of four bytes are written.
 * filter           kernel taps, fetched with one vector load. */
static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, sel01, sel23;
  v16u8 pixels;
  v8i16 coefs, rows01, rows23, rows45, rows67;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef01, coef23);

  sel01 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  sel23 = sel01 + 2;

  /* Rows 0..3. */
  src -= 1;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  src += (4 * src_stride);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, coef01,
                             coef23, rows01, rows23);

  /* Rows 4..7, reusing the same input registers. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, coef01,
                             coef23, rows45, rows67);

  SRARI_H4_SH(rows01, rows23, rows45, rows67, VP8_FILTER_SHIFT);
  SAT_SH4_SH(rows01, rows23, rows45, rows67, 7);

  pixels = PCKEV_XORI128_UB(rows01, rows23);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  pixels = PCKEV_XORI128_UB(rows45, rows67);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
680
/* Horizontal 4-tap interpolation of a 4-wide block: forwards to the 4x4 or
 * 4x8 routine according to height.  Any other height writes nothing. */
static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default: break;
  }
}
690
/* Horizontal 4-tap interpolation of an 8-wide block, four rows per pass.
 *
 * src, src_stride  source pixels; each row is read starting one byte to the
 *                  left of src.
 * dst, dst_stride  destination; eight bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 row0, row1, row2, row3;
  v16i8 coef01, coef23, sel01, sel23;
  v16u8 px01, px23;
  v8i16 coefs, sum0, sum1, sum2, sum3;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef01, coef23);

  sel01 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  sel23 = sel01 + 2;

  src -= 1;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, row0, row1, row2, row3);
    src += (4 * src_stride);
    XORI_B4_128_SB(row0, row1, row2, row3);

    HORIZ_4TAP_8WID_4VECS_FILT(row0, row1, row2, row3, sel01, sel23, coef01,
                               coef23, sum0, sum1, sum2, sum3);
    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);

    px01 = PCKEV_XORI128_UB(sum0, sum1);
    px23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(px01, px23, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
722
/* Horizontal 4-tap interpolation of a 16-wide block, four rows per pass.
 * Every row is loaded as two vectors, one at src and one at src + 8, which
 * are filtered separately and packed into one 16-byte output row.
 *
 * src, src_stride  source pixels; each row is read starting one byte to the
 *                  left of src.
 * dst, dst_stride  destination; sixteen bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b;
  v16i8 coef01, coef23, sel01, sel23;
  v16u8 px0, px1, px2, px3;
  v8i16 coefs;
  v8i16 s0a, s0b, s1a, s1b, s2a, s2b, s3a, s3b;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef01, coef23);

  sel01 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  sel23 = sel01 + 2;

  src -= 1;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* "a" vectors start at src, "b" vectors eight bytes further right. */
    LD_SB4(src, src_stride, r0a, r1a, r2a, r3a);
    LD_SB4(src + 8, src_stride, r0b, r1b, r2b, r3b);
    src += (4 * src_stride);
    XORI_B8_128_SB(r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b);

    HORIZ_4TAP_8WID_4VECS_FILT(r0a, r0b, r1a, r1b, sel01, sel23, coef01,
                               coef23, s0a, s0b, s1a, s1b);
    HORIZ_4TAP_8WID_4VECS_FILT(r2a, r2b, r3a, r3b, sel01, sel23, coef01,
                               coef23, s2a, s2b, s3a, s3b);
    SRARI_H4_SH(s0a, s0b, s1a, s1b, VP8_FILTER_SHIFT);
    SRARI_H4_SH(s2a, s2b, s3a, s3b, VP8_FILTER_SHIFT);
    SAT_SH4_SH(s0a, s0b, s1a, s1b, 7);
    SAT_SH4_SH(s2a, s2b, s3a, s3b, 7);

    px0 = PCKEV_XORI128_UB(s0a, s0b);
    px1 = PCKEV_XORI128_UB(s1a, s1b);
    px2 = PCKEV_XORI128_UB(s2a, s2b);
    px3 = PCKEV_XORI128_UB(s3a, s3b);
    ST_UB4(px0, px1, px2, px3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
768
/* Vertical 4-tap interpolation of a 4-wide block, four rows per pass.
 *
 * src, src_stride  source pixels; reading starts one row above src and
 *                  three rows are loaded before the first pass.
 * dst, dst_stride  destination; four bytes are written per row.
 * filter           kernel taps, fetched with one vector load.
 * height           number of rows; height >> 2 passes are made. */
static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 r0, r1, r2, r3, r4, r5, r6;
  v16i8 il10, il21, il32, il43, il54, il65;
  v16i8 pk_a, pk_b, coef01, coef23;
  v16u8 pixels;
  v8i16 coefs, sum10, sum32;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef01, coef23);

  src -= src_stride;
  LD_SB3(src, src_stride, r0, r1, r2);
  src += (3 * src_stride);

  /* Interleave neighbouring rows, pack both pairs into one vector, then
   * apply the 128 bias. */
  ILVR_B2_SB(r1, r0, r2, r1, il10, il21);
  pk_a = (v16i8)__msa_ilvr_d((v2i64)il21, (v2i64)il10);
  pk_a = (v16i8)__msa_xori_b((v16u8)pk_a, 128);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, r3, r4, r5, r6);
    src += (4 * src_stride);

    ILVR_B4_SB(r3, r2, r4, r3, r5, r4, r6, r5, il32, il43, il54, il65);

    pk_b = (v16i8)__msa_ilvr_d((v2i64)il43, (v2i64)il32);
    pk_b = (v16i8)__msa_xori_b((v16u8)pk_b, 128);
    sum10 = FILT_4TAP_DPADD_S_H(pk_a, pk_b, coef01, coef23);

    /* pk_a is rebuilt from the newest rows; it also seeds the next pass. */
    pk_a = (v16i8)__msa_ilvr_d((v2i64)il65, (v2i64)il54);
    pk_a = (v16i8)__msa_xori_b((v16u8)pk_a, 128);
    sum32 = FILT_4TAP_DPADD_S_H(pk_b, pk_a, coef01, coef23);

    SRARI_H2_SH(sum10, sum32, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum10, sum32, 7);
    pixels = PCKEV_XORI128_UB(sum10, sum32);
    ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* The last row loaded becomes the row preceding the next group. */
    r2 = r6;
  }
}
813
/* Vertical 4-tap filter for an 8-pixel-wide block.
 *
 * Reading starts one row above `src`.  Four source rows are consumed and four
 * output rows stored per iteration; the loop runs (height >> 2) times.  The
 * two most recent interleaved row pairs and the last loaded row are carried
 * over to the next iteration. */
static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 ilv10, ilv21, ilv32, ilv43, ilv54, ilv65;
  v16i8 coef0, coef1;
  v16u8 pix01, pix23;
  v8i16 taps, sum0, sum1, sum2, sum3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef0, coef1);

  src -= src_stride;
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  ILVR_B2_SB(row1, row0, row2, row1, ilv10, ilv21);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B4_128_SB(row3, row4, row5, row6);
    ILVR_B4_SB(row3, row2, row4, row3, row5, row4, row6, row5, ilv32, ilv43,
               ilv54, ilv65);

    sum0 = FILT_4TAP_DPADD_S_H(ilv10, ilv32, coef0, coef1);
    sum1 = FILT_4TAP_DPADD_S_H(ilv21, ilv43, coef0, coef1);
    sum2 = FILT_4TAP_DPADD_S_H(ilv32, ilv54, coef0, coef1);
    sum3 = FILT_4TAP_DPADD_S_H(ilv43, ilv65, coef0, coef1);

    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    pix01 = PCKEV_XORI128_UB(sum0, sum1);
    pix23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    ilv10 = ilv54;
    ilv21 = ilv65;
    row2 = row6;
  }
}
857
/* Vertical 4-tap filter for a 16-pixel-wide block.
 *
 * Same scheme as the 8-wide variant, except that each row pair is interleaved
 * twice (ILVR_* and ILVL_*) and both interleavings are filtered, then packed
 * together with PCKEV_B4_UB and stored as four full 16-byte rows.  Reading
 * starts one row above `src`; the loop runs (height >> 2) times. */
static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 rt10, rt21, rt32, rt43, rt54, rt65;
  v16i8 lf10, lf21, lf32, lf43, lf54, lf65;
  v16i8 coef0, coef1;
  v16u8 pix0, pix1, pix2, pix3;
  v8i16 taps;
  v8i16 sum0_rt, sum1_rt, sum2_rt, sum3_rt;
  v8i16 sum0_lf, sum1_lf, sum2_lf, sum3_lf;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef0, coef1);

  src -= src_stride;
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  ILVR_B2_SB(row1, row0, row2, row1, rt10, rt21);
  ILVL_B2_SB(row1, row0, row2, row1, lf10, lf21);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B4_128_SB(row3, row4, row5, row6);
    ILVR_B4_SB(row3, row2, row4, row3, row5, row4, row6, row5, rt32, rt43,
               rt54, rt65);
    ILVL_B4_SB(row3, row2, row4, row3, row5, row4, row6, row5, lf32, lf43,
               lf54, lf65);

    sum0_rt = FILT_4TAP_DPADD_S_H(rt10, rt32, coef0, coef1);
    sum1_rt = FILT_4TAP_DPADD_S_H(rt21, rt43, coef0, coef1);
    sum2_rt = FILT_4TAP_DPADD_S_H(rt32, rt54, coef0, coef1);
    sum3_rt = FILT_4TAP_DPADD_S_H(rt43, rt65, coef0, coef1);
    sum0_lf = FILT_4TAP_DPADD_S_H(lf10, lf32, coef0, coef1);
    sum1_lf = FILT_4TAP_DPADD_S_H(lf21, lf43, coef0, coef1);
    sum2_lf = FILT_4TAP_DPADD_S_H(lf32, lf54, coef0, coef1);
    sum3_lf = FILT_4TAP_DPADD_S_H(lf43, lf65, coef0, coef1);

    SRARI_H4_SH(sum0_rt, sum1_rt, sum2_rt, sum3_rt, VP8_FILTER_SHIFT);
    SRARI_H4_SH(sum0_lf, sum1_lf, sum2_lf, sum3_lf, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0_rt, sum1_rt, sum2_rt, sum3_rt, 7);
    SAT_SH4_SH(sum0_lf, sum1_lf, sum2_lf, sum3_lf, 7);

    PCKEV_B4_UB(sum0_lf, sum0_rt, sum1_lf, sum1_rt, sum2_lf, sum2_rt, sum3_lf,
                sum3_rt, pix0, pix1, pix2, pix3);
    XORI_B4_128_UB(pix0, pix1, pix2, pix3);
    ST_UB4(pix0, pix1, pix2, pix3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    rt10 = rt54;
    rt21 = rt65;
    lf10 = lf54;
    lf21 = lf65;
    row2 = row6;
  }
}
914
/* Horizontal 4-tap followed by vertical 4-tap filter, 4-pixel-wide block.
 *
 * `src` is first moved back by one column and one row.  The shuffle mask is
 * taken from vp8_mc_filt_mask_arr[16] (the "4 width" entry), and each
 * HORIZ_4TAP_FILT call is given two source rows.  Four output rows are stored
 * per iteration; the loop runs (height >> 2) times.  The vertical result is
 * rounded with a literal shift of 7. */
static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 hcoef0, hcoef1;
  v16u8 shuf0, shuf1, packed;
  v8i16 htaps, vtaps, vcoef0, vcoef1;
  v8i16 acc0, acc1, pair0, pair1, pair2;
  v8i16 hz0, hz1, hz2, hz3, hz4, hz5;

  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef0, hcoef1);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef0, vcoef1);

  src -= (1 + 1 * src_stride);
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  hz0 = HORIZ_4TAP_FILT(row0, row1, shuf0, shuf1, hcoef0, hcoef1);
  hz1 = HORIZ_4TAP_FILT(row1, row2, shuf0, shuf1, hcoef0, hcoef1);
  pair0 = (v8i16)__msa_ilvev_b((v16i8)hz1, (v16i8)hz0);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B2_128_SB(row3, row4);
    hz3 = HORIZ_4TAP_FILT(row3, row4, shuf0, shuf1, hcoef0, hcoef1);
    /* Build the in-between vector from the previous and the new result. */
    hz2 = (v8i16)__msa_sldi_b((v16i8)hz3, (v16i8)hz1, 8);
    pair1 = (v8i16)__msa_ilvev_b((v16i8)hz3, (v16i8)hz2);
    acc0 = FILT_4TAP_DPADD_S_H(pair0, pair1, vcoef0, vcoef1);

    XORI_B2_128_SB(row5, row6);
    hz5 = HORIZ_4TAP_FILT(row5, row6, shuf0, shuf1, hcoef0, hcoef1);
    hz4 = (v8i16)__msa_sldi_b((v16i8)hz5, (v16i8)hz3, 8);
    pair2 = (v8i16)__msa_ilvev_b((v16i8)hz5, (v16i8)hz4);
    acc1 = FILT_4TAP_DPADD_S_H(pair1, pair2, vcoef0, vcoef1);

    SRARI_H2_SH(acc0, acc1, 7);
    SAT_SH2_SH(acc0, acc1, 7);
    packed = PCKEV_XORI128_UB(acc0, acc1);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and row pair into the next pass. */
    hz1 = hz5;
    pair0 = pair2;
  }
}
971
/* Horizontal 4-tap followed by vertical 4-tap filter, 8-pixel-wide block.
 *
 * `src` is first moved back by one column and one row.  The shuffle mask is
 * taken from vp8_mc_filt_mask_arr[0] (the "8 width" entry) and every row is
 * passed to HORIZ_4TAP_FILT as both inputs.  Four output rows are stored per
 * iteration; the loop runs (height >> 2) times.  The vertical result is
 * rounded with a literal shift of 7. */
static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 hcoef0, hcoef1;
  v16u8 shuf0, shuf1, pix01, pix23;
  v8i16 htaps, vtaps, vcoef0, vcoef1;
  v8i16 acc0, acc1, acc2, acc3;
  v8i16 hz0, hz1, hz2, hz3;
  v8i16 pair0, pair1, pair2, pair3, pair4;

  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef0, hcoef1);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef0, vcoef1);

  src -= (1 + 1 * src_stride);
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  hz0 = HORIZ_4TAP_FILT(row0, row0, shuf0, shuf1, hcoef0, hcoef1);
  hz1 = HORIZ_4TAP_FILT(row1, row1, shuf0, shuf1, hcoef0, hcoef1);
  hz2 = HORIZ_4TAP_FILT(row2, row2, shuf0, shuf1, hcoef0, hcoef1);
  ILVEV_B2_SH(hz0, hz1, hz1, hz2, pair0, pair2);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B4_128_SB(row3, row4, row5, row6);

    hz3 = HORIZ_4TAP_FILT(row3, row3, shuf0, shuf1, hcoef0, hcoef1);
    pair1 = (v8i16)__msa_ilvev_b((v16i8)hz3, (v16i8)hz2);
    acc0 = FILT_4TAP_DPADD_S_H(pair0, pair1, vcoef0, vcoef1);

    hz0 = HORIZ_4TAP_FILT(row4, row4, shuf0, shuf1, hcoef0, hcoef1);
    pair3 = (v8i16)__msa_ilvev_b((v16i8)hz0, (v16i8)hz3);
    acc1 = FILT_4TAP_DPADD_S_H(pair2, pair3, vcoef0, vcoef1);

    hz1 = HORIZ_4TAP_FILT(row5, row5, shuf0, shuf1, hcoef0, hcoef1);
    pair4 = (v8i16)__msa_ilvev_b((v16i8)hz1, (v16i8)hz0);
    acc2 = FILT_4TAP_DPADD_S_H(pair1, pair4, vcoef0, vcoef1);

    hz2 = HORIZ_4TAP_FILT(row6, row6, shuf0, shuf1, hcoef0, hcoef1);
    ILVEV_B2_SH(hz3, hz0, hz1, hz2, pair0, pair1);
    acc3 = FILT_4TAP_DPADD_S_H(pair0, pair1, vcoef0, vcoef1);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, 7);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pix01 = PCKEV_XORI128_UB(acc0, acc1);
    pix23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* hz2 already holds the last filtered row; these two pairs complete the
     * state needed by the next iteration. */
    pair0 = pair4;
    pair2 = pair1;
  }
}
1036
/* Horizontal 4-tap + vertical 4-tap filter for a 16-pixel-wide block,
 * done as two 8-wide passes at column offsets 0 and 8. */
static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1050
/* Horizontal 6-tap followed by vertical 4-tap filter, 4-pixel-wide block.
 *
 * `src` is first moved back by two columns and one row.  The shuffle mask is
 * taken from vp8_mc_filt_mask_arr[16] (the "4 width" entry), and each
 * HORIZ_6TAP_FILT call is given two source rows.  Four output rows are stored
 * per iteration; the loop runs (height >> 2) times.  The vertical result is
 * rounded with a literal shift of 7. */
static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 hcoef0, hcoef1, hcoef2;
  v16u8 pix_a, pix_b, shuf0, shuf1, shuf2;
  v8i16 htaps, vtaps, vcoef0, vcoef1;
  v8i16 acc0, acc1, pair0, pair1, pair2;
  v8i16 hz0, hz1, hz2, hz3, hz4, hz5;

  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  htaps = LD_SH(filter_horiz);
  SPLATI_H3_SB(htaps, 0, 1, 2, hcoef0, hcoef1, hcoef2);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef0, vcoef1);

  src -= (2 + 1 * src_stride);
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  hz0 = HORIZ_6TAP_FILT(row0, row1, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                        hcoef2);
  hz1 = HORIZ_6TAP_FILT(row1, row2, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                        hcoef2);
  pair0 = (v8i16)__msa_ilvev_b((v16i8)hz1, (v16i8)hz0);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B4_128_SB(row3, row4, row5, row6);

    hz3 = HORIZ_6TAP_FILT(row3, row4, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    /* Build the in-between vector from the previous and the new result. */
    hz2 = (v8i16)__msa_sldi_b((v16i8)hz3, (v16i8)hz1, 8);
    pair1 = (v8i16)__msa_ilvev_b((v16i8)hz3, (v16i8)hz2);
    acc0 = FILT_4TAP_DPADD_S_H(pair0, pair1, vcoef0, vcoef1);

    hz5 = HORIZ_6TAP_FILT(row5, row6, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    hz4 = (v8i16)__msa_sldi_b((v16i8)hz5, (v16i8)hz3, 8);
    pair2 = (v8i16)__msa_ilvev_b((v16i8)hz5, (v16i8)hz4);
    acc1 = FILT_4TAP_DPADD_S_H(pair1, pair2, vcoef0, vcoef1);

    SRARI_H2_SH(acc0, acc1, 7);
    SAT_SH2_SH(acc0, acc1, 7);
    PCKEV_B2_UB(acc0, acc0, acc1, acc1, pix_a, pix_b);
    XORI_B2_128_UB(pix_a, pix_b);
    ST4x4_UB(pix_a, pix_b, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and row pair into the next pass. */
    hz1 = hz5;
    pair0 = pair2;
  }
}
1113
/* Horizontal 6-tap followed by vertical 4-tap filter, 8-pixel-wide block.
 *
 * `src` is first moved back by two columns and one row.  The shuffle mask is
 * taken from vp8_mc_filt_mask_arr[0] (the "8 width" entry) and every row is
 * passed to HORIZ_6TAP_FILT as both inputs.  Four output rows are stored per
 * iteration; the loop runs (height >> 2) times.  The vertical result is
 * rounded with a literal shift of 7. */
static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6;
  v16i8 hcoef0, hcoef1, hcoef2, shuf0, shuf1, shuf2;
  v8i16 htaps, vtaps, vcoef0, vcoef1;
  v8i16 hz0, hz1, hz2, hz3;
  v8i16 acc0, acc1, acc2, acc3;
  v8i16 pair0, pair1, pair2, pair3;
  v16u8 pix01, pix23;

  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  htaps = LD_SH(filter_horiz);
  SPLATI_H3_SB(htaps, 0, 1, 2, hcoef0, hcoef1, hcoef2);
  vtaps = LD_SH(filter_vert);
  SPLATI_H2_SH(vtaps, 0, 1, vcoef0, vcoef1);

  src -= (2 + src_stride);
  LD_SB3(src, src_stride, row0, row1, row2);
  src += (3 * src_stride);

  XORI_B3_128_SB(row0, row1, row2);
  hz0 = HORIZ_6TAP_FILT(row0, row0, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                        hcoef2);
  hz1 = HORIZ_6TAP_FILT(row1, row1, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                        hcoef2);
  hz2 = HORIZ_6TAP_FILT(row2, row2, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                        hcoef2);
  ILVEV_B2_SH(hz0, hz1, hz1, hz2, pair0, pair2);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row3, row4, row5, row6);
    src += (4 * src_stride);

    XORI_B4_128_SB(row3, row4, row5, row6);

    hz3 = HORIZ_6TAP_FILT(row3, row3, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    pair1 = (v8i16)__msa_ilvev_b((v16i8)hz3, (v16i8)hz2);
    acc0 = FILT_4TAP_DPADD_S_H(pair0, pair1, vcoef0, vcoef1);

    hz0 = HORIZ_6TAP_FILT(row4, row4, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    pair3 = (v8i16)__msa_ilvev_b((v16i8)hz0, (v16i8)hz3);
    acc1 = FILT_4TAP_DPADD_S_H(pair2, pair3, vcoef0, vcoef1);

    /* pair0 and pair2 are overwritten below with the values the next
     * iteration expects, so no explicit carry-over is needed at the end. */
    hz1 = HORIZ_6TAP_FILT(row5, row5, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    pair0 = (v8i16)__msa_ilvev_b((v16i8)hz1, (v16i8)hz0);
    acc2 = FILT_4TAP_DPADD_S_H(pair1, pair0, vcoef0, vcoef1);

    hz2 = HORIZ_6TAP_FILT(row6, row6, shuf0, shuf1, shuf2, hcoef0, hcoef1,
                          hcoef2);
    ILVEV_B2_SH(hz3, hz0, hz1, hz2, pair1, pair2);
    acc3 = FILT_4TAP_DPADD_S_H(pair1, pair2, vcoef0, vcoef1);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, 7);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pix01 = PCKEV_XORI128_UB(acc0, acc1);
    pix23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
1184
/* Horizontal 6-tap + vertical 4-tap filter for a 16-pixel-wide block,
 * done as two 8-wide passes at column offsets 0 and 8. */
static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1198
/* Horizontal 4-tap followed by vertical 6-tap filter, 4-pixel-wide block.
 *
 * `src` is first moved back by one column and two rows, and five rows are
 * loaded before the loop starts.  The shuffle mask is taken from
 * vp8_mc_filt_mask_arr[16] (the "4 width" entry), and each HORIZ_4TAP_FILT
 * call is given two source rows.  Four output rows are stored per iteration;
 * the loop runs (height >> 2) times.  The vertical result is rounded with a
 * literal shift of 7. */
static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16i8 hcoef0, hcoef1, shuf0, shuf1;
  v16u8 packed;
  v8i16 hz0, hz1, hz2, hz3, hz4, hz5, hz6, hz7;
  v8i16 acc0, acc1, pair0, pair1, pair2, pair3;
  v8i16 htaps, vtaps, vcoef0, vcoef1, vcoef2;

  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef0, hcoef1);
  vtaps = LD_SH(filter_vert);
  SPLATI_H3_SH(vtaps, 0, 1, 2, vcoef0, vcoef1, vcoef2);

  src -= (1 + 2 * src_stride);
  LD_SB5(src, src_stride, row0, row1, row2, row3, row4);
  src += (5 * src_stride);

  XORI_B5_128_SB(row0, row1, row2, row3, row4);
  hz0 = HORIZ_4TAP_FILT(row0, row1, shuf0, shuf1, hcoef0, hcoef1);
  hz2 = HORIZ_4TAP_FILT(row2, row3, shuf0, shuf1, hcoef0, hcoef1);
  hz3 = HORIZ_4TAP_FILT(row3, row4, shuf0, shuf1, hcoef0, hcoef1);
  /* hz1 is assembled from its two neighbours rather than filtered again. */
  hz1 = (v8i16)__msa_sldi_b((v16i8)hz2, (v16i8)hz0, 8);
  ILVEV_B2_SH(hz0, hz1, hz2, hz3, pair0, pair1);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row5, row6, row7, row8);
    XORI_B4_128_SB(row5, row6, row7, row8);
    src += (4 * src_stride);

    hz5 = HORIZ_4TAP_FILT(row5, row6, shuf0, shuf1, hcoef0, hcoef1);
    hz4 = (v8i16)__msa_sldi_b((v16i8)hz5, (v16i8)hz3, 8);
    pair2 = (v8i16)__msa_ilvev_b((v16i8)hz5, (v16i8)hz4);
    acc0 = DPADD_SH3_SH(pair0, pair1, pair2, vcoef0, vcoef1, vcoef2);

    hz7 = HORIZ_4TAP_FILT(row7, row8, shuf0, shuf1, hcoef0, hcoef1);
    hz6 = (v8i16)__msa_sldi_b((v16i8)hz7, (v16i8)hz5, 8);
    pair3 = (v8i16)__msa_ilvev_b((v16i8)hz7, (v16i8)hz6);
    acc1 = DPADD_SH3_SH(pair1, pair2, pair3, vcoef0, vcoef1, vcoef2);

    SRARI_H2_SH(acc0, acc1, 7);
    SAT_SH2_SH(acc0, acc1, 7);
    packed = PCKEV_XORI128_UB(acc0, acc1);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    hz3 = hz7;
    pair0 = pair2;
    pair1 = pair3;
  }
}
1260
/* Horizontal 4-tap followed by vertical 6-tap filter, 8-pixel-wide block.
 *
 * `src` is first moved back by one column and two rows, and five rows are
 * loaded and horizontally filtered before the loop starts.  The shuffle mask
 * is taken from vp8_mc_filt_mask_arr[0] (the "8 width" entry) and every row
 * is passed to HORIZ_4TAP_FILT as both inputs.  Four output rows are stored
 * per iteration; the loop runs (height >> 2) times.  The vertical result is
 * rounded with a literal shift of 7. */
static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t blocks_left = (uint32_t)(height >> 2);
  v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16i8 hcoef0, hcoef1, shuf0, shuf1;
  v8i16 htaps, vtaps, vcoef0, vcoef1, vcoef2;
  v8i16 acc0, acc1, acc2, acc3;
  v8i16 hz0, hz1, hz2, hz3, hz4, hz5, hz6, hz7, hz8;
  v8i16 pair0, pair1, pair2, pair3, pair4, pair5, pair6, pair7;
  v16u8 pix01, pix23;

  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;

  htaps = LD_SH(filter_horiz);
  SPLATI_H2_SB(htaps, 0, 1, hcoef0, hcoef1);
  vtaps = LD_SH(filter_vert);
  SPLATI_H3_SH(vtaps, 0, 1, 2, vcoef0, vcoef1, vcoef2);

  src -= (1 + 2 * src_stride);
  LD_SB5(src, src_stride, row0, row1, row2, row3, row4);
  src += (5 * src_stride);

  XORI_B5_128_SB(row0, row1, row2, row3, row4);
  hz0 = HORIZ_4TAP_FILT(row0, row0, shuf0, shuf1, hcoef0, hcoef1);
  hz1 = HORIZ_4TAP_FILT(row1, row1, shuf0, shuf1, hcoef0, hcoef1);
  hz2 = HORIZ_4TAP_FILT(row2, row2, shuf0, shuf1, hcoef0, hcoef1);
  hz3 = HORIZ_4TAP_FILT(row3, row3, shuf0, shuf1, hcoef0, hcoef1);
  hz4 = HORIZ_4TAP_FILT(row4, row4, shuf0, shuf1, hcoef0, hcoef1);
  ILVEV_B2_SH(hz0, hz1, hz2, hz3, pair0, pair1);
  ILVEV_B2_SH(hz1, hz2, hz3, hz4, pair3, pair4);

  while (blocks_left--) {
    LD_SB4(src, src_stride, row5, row6, row7, row8);
    src += (4 * src_stride);

    XORI_B4_128_SB(row5, row6, row7, row8);

    hz5 = HORIZ_4TAP_FILT(row5, row5, shuf0, shuf1, hcoef0, hcoef1);
    pair2 = (v8i16)__msa_ilvev_b((v16i8)hz5, (v16i8)hz4);
    acc0 = DPADD_SH3_SH(pair0, pair1, pair2, vcoef0, vcoef1, vcoef2);

    hz6 = HORIZ_4TAP_FILT(row6, row6, shuf0, shuf1, hcoef0, hcoef1);
    pair5 = (v8i16)__msa_ilvev_b((v16i8)hz6, (v16i8)hz5);
    acc1 = DPADD_SH3_SH(pair3, pair4, pair5, vcoef0, vcoef1, vcoef2);

    hz7 = HORIZ_4TAP_FILT(row7, row7, shuf0, shuf1, hcoef0, hcoef1);
    pair6 = (v8i16)__msa_ilvev_b((v16i8)hz7, (v16i8)hz6);
    acc2 = DPADD_SH3_SH(pair1, pair2, pair6, vcoef0, vcoef1, vcoef2);

    hz8 = HORIZ_4TAP_FILT(row8, row8, shuf0, shuf1, hcoef0, hcoef1);
    pair7 = (v8i16)__msa_ilvev_b((v16i8)hz8, (v16i8)hz7);
    acc3 = DPADD_SH3_SH(pair4, pair5, pair7, vcoef0, vcoef1, vcoef2);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, 7);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pix01 = PCKEV_XORI128_UB(acc0, acc1);
    pix23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pix01, pix23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    hz4 = hz8;
    pair0 = pair2;
    pair1 = pair6;
    pair3 = pair5;
    pair4 = pair7;
  }
}
1333
/* Horizontal 4-tap + vertical 6-tap filter for a 16-pixel-wide block,
 * done as two 8-wide passes at column offsets 0 and 8. */
static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1347
/* 4x4 six-tap sub-pixel prediction.
 *
 * xoffset / yoffset pick the horizontal / vertical filter row
 * vp8_subpel_filters_msa[offset - 1]:
 *   - 0          : no filtering in that direction;
 *   - 2, 4, 6    : 6-tap path, filter row used as is;
 *   - 1, 3, 5, 7 : 4-tap path, filter pointer advanced by one (those table
 *                  rows start with a zero tap).
 * With both offsets zero the four 32-bit rows are copied from src to dst.
 *
 * The table row is looked up only after the offset is known to be non-zero,
 * so a pointer to vp8_subpel_filters_msa[-1] is never formed.  Offsets are
 * presumably always in 0..7 -- TODO confirm against callers; other values
 * fall through the switches without writing dst. */
void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  if (yoffset) {
    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];

    if (xoffset) {
      const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical-only filtering. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break;
      }
    }
  } else if (xoffset) {
    /* Horizontal-only filtering. */
    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break;
    }
  } else {
    /* No sub-pixel offset: plain 4x4 copy, one 32-bit word per row. */
    uint32_t tp0, tp1, tp2, tp3;

    LW4(src, src_stride, tp0, tp1, tp2, tp3);
    SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
  }
}
1441
/* 8x4 six-tap sub-pixel prediction.
 *
 * xoffset / yoffset pick the horizontal / vertical filter row
 * vp8_subpel_filters_msa[offset - 1]:
 *   - 0          : no filtering in that direction;
 *   - 2, 4, 6    : 6-tap path, filter row used as is;
 *   - 1, 3, 5, 7 : 4-tap path, filter pointer advanced by one (those table
 *                  rows start with a zero tap).
 * With both offsets zero the block is copied with vp8_copy_mem8x4().
 *
 * The table row is looked up only after the offset is known to be non-zero,
 * so a pointer to vp8_subpel_filters_msa[-1] is never formed.  Offsets are
 * presumably always in 0..7 -- TODO confirm against callers; other values
 * fall through the switches without writing dst. */
void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  if (yoffset) {
    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];

    if (xoffset) {
      const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical-only filtering. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break;
      }
    }
  } else if (xoffset) {
    /* Horizontal-only filtering. */
    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break;
    }
  } else {
    /* No sub-pixel offset: plain copy. */
    vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
  }
}
1529
/* 8x8 six-tap sub-pixel prediction.
 *
 * xoffset / yoffset pick the horizontal / vertical filter row
 * vp8_subpel_filters_msa[offset - 1]:
 *   - 0          : no filtering in that direction;
 *   - 2, 4, 6    : 6-tap path, filter row used as is;
 *   - 1, 3, 5, 7 : 4-tap path, filter pointer advanced by one (those table
 *                  rows start with a zero tap).
 * With both offsets zero the block is copied with vp8_copy_mem8x8().
 *
 * The table row is looked up only after the offset is known to be non-zero,
 * so a pointer to vp8_subpel_filters_msa[-1] is never formed.  Offsets are
 * presumably always in 0..7 -- TODO confirm against callers; other values
 * fall through the switches without writing dst. */
void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  if (yoffset) {
    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];

    if (xoffset) {
      const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 8);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 8);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical-only filtering. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              8);
          break;

        default: break;
      }
    }
  } else if (xoffset) {
    /* Horizontal-only filtering. */
    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
        break;

      default: break;
    }
  } else {
    /* No sub-pixel offset: plain copy. */
    vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
  }
}
1617
/* 16x16 six-tap sub-pixel prediction.
 *
 * xoffset / yoffset pick the horizontal / vertical filter row
 * vp8_subpel_filters_msa[offset - 1]:
 *   - 0          : no filtering in that direction;
 *   - 2, 4, 6    : 6-tap path, filter row used as is;
 *   - 1, 3, 5, 7 : 4-tap path, filter pointer advanced by one (those table
 *                  rows start with a zero tap).
 * With both offsets zero the block is copied with vp8_copy_mem16x16().
 *
 * The table row is looked up only after the offset is known to be non-zero,
 * so a pointer to vp8_subpel_filters_msa[-1] is never formed.  Offsets are
 * presumably always in 0..7 -- TODO confirm against callers; other values
 * fall through the switches without writing dst. */
void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  if (yoffset) {
    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];

    if (xoffset) {
      const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter + 1, 16);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter + 1, 16);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical-only filtering. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                               16);
          break;

        default: break;
      }
    }
  } else if (xoffset) {
    /* Horizontal-only filtering. */
    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];

    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
                             16);
        break;

      default: break;
    }
  } else {
    /* No sub-pixel offset: plain copy. */
    vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
  }
}
1706