1 /*
2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavcodec/vp8dsp.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "vp8dsp_mips.h"
24
/*
 * Byte-shuffle index table: three consecutive 16-entry masks (48 bytes).
 * The kernels in this file load the mask at offset 0 for their 8- and
 * 16-pixel-wide paths and the mask at offset 16 for their 4-pixel-wide
 * paths, then derive the remaining masks by adding 2 or 4 to every index.
 */
static const uint8_t mc_filt_mask_arr[48] = {
    /* offset 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* offset 16: 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* offset 32: 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33
/*
 * Sub-pixel filter taps, one 8-byte row per fractional position.  The
 * kernels in this file select a row with [mx - 1] or [my - 1], i.e. rows
 * 0..6.  Rows 1, 3 and 5 carry six taps; rows 0, 2, 4 and 6 carry four.
 *
 * Every user fetches its taps with LD_SH() into a v8i16, i.e. a whole
 * vector (16 bytes) starting at the selected row.  The trailing all-zero
 * row exists only so that the load issued for row 6 stays inside this
 * array; it is never selected as a filter.
 */
static const int8_t subpel_filters_msa[7 + 1][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0},           /* padding for 16-byte loads */
};
43
/*
 * Two-tap weight pairs, seven rows.  Within each row the two weights add
 * up to 128; the first weight falls from 112 to 16 in steps of 16 while
 * the second rises from 16 to 112.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 },
};
53
/*
 * Statement-expression macro that evaluates to one v8i16 vector.
 *
 * The (src0, src1) pair is byte-shuffled three times, once per mask, and
 * the three shuffles are fed to a three-term dot-product accumulation
 * against filt_h0, filt_h1 and filt_h2.  The sums are then passed through
 * __msa_srari_h(.., 7) and __msa_sat_s_h(.., 7) before being returned.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                  \
                        filt_h0, filt_h1, filt_h2)                        \
( {                                                                       \
    v16i8 shf0_m, shf1_m, shf2_m;                                         \
    v8i16 sum_m;                                                          \
                                                                          \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,   \
               shf0_m, shf1_m, shf2_m);                                   \
    sum_m = DPADD_SH3_SH(shf0_m, shf1_m, shf2_m,                          \
                         filt_h0, filt_h1, filt_h2);                      \
                                                                          \
    __msa_sat_s_h(__msa_srari_h(sum_m, 7), 7);                            \
} )
70
/*
 * 6-tap horizontal accumulation for 4-pixel-wide rows held in four vectors.
 *
 * For each mask two shuffles are taken, one over (src0, src1) and one over
 * (src2, src3).  The first feeds out0, the second out1: DOTP with filt0
 * starts each sum, DPADD with filt1 and then filt2 extends it.  out0 and
 * out1 are overwritten; rounding and saturation are left to the caller.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing ';'.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2,                    \
                                   filt0, filt1, filt2,                    \
                                   out0, out1)                             \
do {                                                                       \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
} while (0)
85
/*
 * 6-tap horizontal accumulation for four source vectors, one output
 * vector per source: srcN is shuffled against itself with each mask and
 * the result is accumulated into outN (DOTP with filt0, then DPADD with
 * filt1 and filt2).  out0..out3 are overwritten; rounding and saturation
 * are left to the caller.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing ';'.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2,                       \
                                   filt0, filt1, filt2,                       \
                                   out0, out1, out2, out3)                    \
do {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
                 out0, out1, out2, out3);                                     \
} while (0)
106
/*
 * Statement-expression macro evaluating to a v8i16:
 *     __msa_dotp_s_h(vec0, filt0) accumulated with
 *     __msa_dpadd_s_h(.., vec1, filt1).
 * All four arguments are reinterpreted as v16i8.
 *
 * The temporary carries the "_m" suffix used by the other macros in this
 * file so that it cannot capture a caller variable passed as an argument
 * (a plain "tmp0" would), and the arguments are parenthesized under the
 * casts.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                     \
( {                                                                       \
    v8i16 acc_m;                                                          \
                                                                          \
    acc_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));              \
    acc_m = __msa_dpadd_s_h(acc_m, (v16i8) (vec1), (v16i8) (filt1));      \
                                                                          \
    acc_m;                                                                \
} )
116
/*
 * Statement-expression macro that evaluates to one v8i16 vector.
 *
 * The (src0, src1) pair is byte-shuffled with mask0 and with mask1, the two
 * shuffles go through FILT_4TAP_DPADD_S_H against filt_h0 / filt_h1, and
 * the sums are passed through __msa_srari_h(.., 7) and
 * __msa_sat_s_h(.., 7) before being returned.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)       \
( {                                                                       \
    v16i8 shf0_m, shf1_m;                                                 \
    v8i16 hz_m;                                                           \
                                                                          \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, shf0_m, shf1_m);     \
    hz_m = FILT_4TAP_DPADD_S_H(shf0_m, shf1_m, filt_h0, filt_h1);         \
                                                                          \
    __msa_sat_s_h(__msa_srari_h(hz_m, 7), 7);                             \
} )
130
/*
 * 4-tap horizontal accumulation for 4-pixel-wide rows held in four vectors.
 *
 * For each mask two shuffles are taken, one over (src0, src1) and one over
 * (src2, src3).  The first feeds out0, the second out1: DOTP with filt0
 * starts each sum, DPADD with filt1 completes it.  out0 and out1 are
 * overwritten; rounding and saturation are left to the caller.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing ';'.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, filt0, filt1,             \
                                   out0, out1)                             \
do {                                                                       \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
} while (0)
142
/*
 * 4-tap horizontal accumulation for four source vectors, one output
 * vector per source: srcN is shuffled against itself with mask0 and mask1
 * and accumulated into outN (DOTP with filt0, then DPADD with filt1).
 * out0..out3 are overwritten; rounding and saturation are left to the
 * caller.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing ';'.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
do {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
} while (0)
158
common_hz_6t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)159 static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160 uint8_t *dst, int32_t dst_stride,
161 const int8_t *filter)
162 {
163 v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164 v16u8 mask0, mask1, mask2, out;
165 v8i16 filt, out0, out1;
166
167 mask0 = LD_UB(&mc_filt_mask_arr[16]);
168 src -= 2;
169
170 /* rearranging filter */
171 filt = LD_SH(filter);
172 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173
174 mask1 = mask0 + 2;
175 mask2 = mask0 + 4;
176
177 LD_SB4(src, src_stride, src0, src1, src2, src3);
178 XORI_B4_128_SB(src0, src1, src2, src3);
179 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180 filt0, filt1, filt2, out0, out1);
181 SRARI_H2_SH(out0, out1, 7);
182 SAT_SH2_SH(out0, out1, 7);
183 out = PCKEV_XORI128_UB(out0, out1);
184 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185 }
186
common_hz_6t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)187 static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188 uint8_t *dst, int32_t dst_stride,
189 const int8_t *filter)
190 {
191 v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192 v16u8 mask0, mask1, mask2, out;
193 v8i16 filt, out0, out1, out2, out3;
194
195 mask0 = LD_UB(&mc_filt_mask_arr[16]);
196 src -= 2;
197
198 /* rearranging filter */
199 filt = LD_SH(filter);
200 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201
202 mask1 = mask0 + 2;
203 mask2 = mask0 + 4;
204
205 LD_SB4(src, src_stride, src0, src1, src2, src3);
206 XORI_B4_128_SB(src0, src1, src2, src3);
207 src += (4 * src_stride);
208 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209 filt0, filt1, filt2, out0, out1);
210 LD_SB4(src, src_stride, src0, src1, src2, src3);
211 XORI_B4_128_SB(src0, src1, src2, src3);
212 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213 filt0, filt1, filt2, out2, out3);
214 SRARI_H4_SH(out0, out1, out2, out3, 7);
215 SAT_SH4_SH(out0, out1, out2, out3, 7);
216 out = PCKEV_XORI128_UB(out0, out1);
217 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218 out = PCKEV_XORI128_UB(out2, out3);
219 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220 }
221
ff_put_vp8_epel4_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)222 void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223 uint8_t *src, ptrdiff_t src_stride,
224 int height, int mx, int my)
225 {
226 const int8_t *filter = subpel_filters_msa[mx - 1];
227
228 if (4 == height) {
229 common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230 } else if (8 == height) {
231 common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232 }
233 }
234
ff_put_vp8_epel8_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)235 void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236 uint8_t *src, ptrdiff_t src_stride,
237 int height, int mx, int my)
238 {
239 uint32_t loop_cnt;
240 const int8_t *filter = subpel_filters_msa[mx - 1];
241 v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242 v16u8 mask0, mask1, mask2, tmp0, tmp1;
243 v8i16 filt, out0, out1, out2, out3;
244
245 mask0 = LD_UB(&mc_filt_mask_arr[0]);
246
247 src -= 2;
248
249 /* rearranging filter */
250 filt = LD_SH(filter);
251 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252
253 mask1 = mask0 + 2;
254 mask2 = mask0 + 4;
255
256 LD_SB4(src, src_stride, src0, src1, src2, src3);
257 XORI_B4_128_SB(src0, src1, src2, src3);
258 src += (4 * src_stride);
259 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260 filt0, filt1, filt2, out0, out1, out2, out3);
261 SRARI_H4_SH(out0, out1, out2, out3, 7);
262 SAT_SH4_SH(out0, out1, out2, out3, 7);
263 tmp0 = PCKEV_XORI128_UB(out0, out1);
264 tmp1 = PCKEV_XORI128_UB(out2, out3);
265 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266 dst += (4 * dst_stride);
267
268 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269 LD_SB4(src, src_stride, src0, src1, src2, src3);
270 XORI_B4_128_SB(src0, src1, src2, src3);
271 src += (4 * src_stride);
272 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273 filt0, filt1, filt2, out0, out1, out2, out3);
274 SRARI_H4_SH(out0, out1, out2, out3, 7);
275 SAT_SH4_SH(out0, out1, out2, out3, 7);
276 tmp0 = PCKEV_XORI128_UB(out0, out1);
277 tmp1 = PCKEV_XORI128_UB(out2, out3);
278 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279 dst += (4 * dst_stride);
280 }
281 }
282
ff_put_vp8_epel16_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)283 void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284 uint8_t *src, ptrdiff_t src_stride,
285 int height, int mx, int my)
286 {
287 uint32_t loop_cnt;
288 const int8_t *filter = subpel_filters_msa[mx - 1];
289 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290 v16u8 mask0, mask1, mask2, out;
291 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292
293 mask0 = LD_UB(&mc_filt_mask_arr[0]);
294 src -= 2;
295
296 /* rearranging filter */
297 filt = LD_SH(filter);
298 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299
300 mask1 = mask0 + 2;
301 mask2 = mask0 + 4;
302
303 for (loop_cnt = (height >> 2); loop_cnt--;) {
304 LD_SB4(src, src_stride, src0, src2, src4, src6);
305 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307 src += (4 * src_stride);
308
309 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310 filt0, filt1, filt2, out0, out1, out2, out3);
311 HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312 filt0, filt1, filt2, out4, out5, out6, out7);
313 SRARI_H4_SH(out0, out1, out2, out3, 7);
314 SRARI_H4_SH(out4, out5, out6, out7, 7);
315 SAT_SH4_SH(out0, out1, out2, out3, 7);
316 SAT_SH4_SH(out4, out5, out6, out7, 7);
317 out = PCKEV_XORI128_UB(out0, out1);
318 ST_UB(out, dst);
319 dst += dst_stride;
320 out = PCKEV_XORI128_UB(out2, out3);
321 ST_UB(out, dst);
322 dst += dst_stride;
323 out = PCKEV_XORI128_UB(out4, out5);
324 ST_UB(out, dst);
325 dst += dst_stride;
326 out = PCKEV_XORI128_UB(out6, out7);
327 ST_UB(out, dst);
328 dst += dst_stride;
329 }
330 }
331
ff_put_vp8_epel4_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)332 void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333 uint8_t *src, ptrdiff_t src_stride,
334 int height, int mx, int my)
335 {
336 uint32_t loop_cnt;
337 const int8_t *filter = subpel_filters_msa[my - 1];
338 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341 v16u8 out;
342 v8i16 filt, out10, out32;
343
344 src -= (2 * src_stride);
345
346 filt = LD_SH(filter);
347 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348
349 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350 src += (5 * src_stride);
351
352 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353 src32_r, src43_r);
354 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355 XORI_B2_128_SB(src2110, src4332);
356
357 for (loop_cnt = (height >> 2); loop_cnt--;) {
358 LD_SB4(src, src_stride, src5, src6, src7, src8);
359 src += (4 * src_stride);
360
361 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362 src65_r, src76_r, src87_r);
363 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364 XORI_B2_128_SB(src6554, src8776);
365 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367 SRARI_H2_SH(out10, out32, 7);
368 SAT_SH2_SH(out10, out32, 7);
369 out = PCKEV_XORI128_UB(out10, out32);
370 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371 dst += (4 * dst_stride);
372
373 src2110 = src6554;
374 src4332 = src8776;
375 src4 = src8;
376 }
377 }
378
ff_put_vp8_epel8_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)379 void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380 uint8_t *src, ptrdiff_t src_stride,
381 int height, int mx, int my)
382 {
383 uint32_t loop_cnt;
384 const int8_t *filter = subpel_filters_msa[my - 1];
385 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387 v16i8 src109_r, filt0, filt1, filt2;
388 v16u8 tmp0, tmp1;
389 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390
391 src -= (2 * src_stride);
392
393 filt = LD_SH(filter);
394 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395
396 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397 src += (5 * src_stride);
398
399 XORI_B5_128_SB(src0, src1, src2, src3, src4);
400 ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401 src10_r, src32_r, src21_r, src43_r);
402
403 for (loop_cnt = (height >> 2); loop_cnt--;) {
404 LD_SB4(src, src_stride, src7, src8, src9, src10);
405 XORI_B4_128_SB(src7, src8, src9, src10);
406 src += (4 * src_stride);
407
408 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409 src87_r, src98_r, src109_r);
410 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419 dst += (4 * dst_stride);
420
421 src10_r = src76_r;
422 src32_r = src98_r;
423 src21_r = src87_r;
424 src43_r = src109_r;
425 src4 = src10;
426 }
427 }
428
ff_put_vp8_epel16_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)429 void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430 uint8_t *src, ptrdiff_t src_stride,
431 int height, int mx, int my)
432 {
433 uint32_t loop_cnt;
434 const int8_t *filter = subpel_filters_msa[my - 1];
435 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438 v16i8 src65_l, src87_l, filt0, filt1, filt2;
439 v16u8 tmp0, tmp1, tmp2, tmp3;
440 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441
442 src -= (2 * src_stride);
443
444 filt = LD_SH(filter);
445 SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446
447 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448 src += (5 * src_stride);
449
450 XORI_B5_128_SB(src0, src1, src2, src3, src4);
451 ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452 src32_r, src43_r, src21_r);
453 ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454 src32_l, src43_l, src21_l);
455
456 for (loop_cnt = (height >> 2); loop_cnt--;) {
457 LD_SB4(src, src_stride, src5, src6, src7, src8);
458 src += (4 * src_stride);
459
460 XORI_B4_128_SB(src5, src6, src7, src8);
461 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462 src65_r, src76_r, src87_r);
463 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464 src65_l, src76_l, src87_l);
465 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466 filt2);
467 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468 filt2);
469 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470 filt2);
471 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472 filt2);
473 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474 filt2);
475 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476 filt2);
477 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478 filt2);
479 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480 filt2);
481 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486 out3_r, tmp0, tmp1, tmp2, tmp3);
487 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489 dst += (4 * dst_stride);
490
491 src10_r = src54_r;
492 src32_r = src76_r;
493 src21_r = src65_r;
494 src43_r = src87_r;
495 src10_l = src54_l;
496 src32_l = src76_l;
497 src21_l = src65_l;
498 src43_l = src87_l;
499 src4 = src8;
500 }
501 }
502
ff_put_vp8_epel4_h6v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)503 void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504 uint8_t *src, ptrdiff_t src_stride,
505 int height, int mx, int my)
506 {
507 uint32_t loop_cnt;
508 const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509 const int8_t *filter_vert = subpel_filters_msa[my - 1];
510 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511 v16i8 filt_hz0, filt_hz1, filt_hz2;
512 v16u8 mask0, mask1, mask2, out;
513 v8i16 tmp0, tmp1;
514 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515 v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516
517 mask0 = LD_UB(&mc_filt_mask_arr[16]);
518 src -= (2 + 2 * src_stride);
519
520 /* rearranging filter */
521 filt = LD_SH(filter_horiz);
522 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523
524 filt = LD_SH(filter_vert);
525 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526
527 mask1 = mask0 + 2;
528 mask2 = mask0 + 4;
529
530 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531 src += (5 * src_stride);
532
533 XORI_B5_128_SB(src0, src1, src2, src3, src4);
534 hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535 filt_hz1, filt_hz2);
536 hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537 filt_hz1, filt_hz2);
538 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539 hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540 filt_hz1, filt_hz2);
541 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542
543 for (loop_cnt = (height >> 2); loop_cnt--;) {
544 LD_SB2(src, src_stride, src5, src6);
545 src += (2 * src_stride);
546
547 XORI_B2_128_SB(src5, src6);
548 hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549 filt_hz1, filt_hz2);
550 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551
552 LD_SB2(src, src_stride, src7, src8);
553 src += (2 * src_stride);
554
555 XORI_B2_128_SB(src7, src8);
556 hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557 filt_hz1, filt_hz2);
558 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559
560 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562
563 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564 tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565
566 SRARI_H2_SH(tmp0, tmp1, 7);
567 SAT_SH2_SH(tmp0, tmp1, 7);
568 out = PCKEV_XORI128_UB(tmp0, tmp1);
569 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570 dst += (4 * dst_stride);
571
572 hz_out3 = hz_out7;
573 out0 = out2;
574 out1 = out3;
575 }
576 }
577
ff_put_vp8_epel8_h6v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)578 void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
579 uint8_t *src, ptrdiff_t src_stride,
580 int height, int mx, int my)
581 {
582 uint32_t loop_cnt;
583 const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
584 const int8_t *filter_vert = subpel_filters_msa[my - 1];
585 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
586 v16i8 filt_hz0, filt_hz1, filt_hz2;
587 v16u8 mask0, mask1, mask2, vec0, vec1;
588 v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
589 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591 v8i16 tmp0, tmp1, tmp2, tmp3;
592
593 mask0 = LD_UB(&mc_filt_mask_arr[0]);
594 src -= (2 + 2 * src_stride);
595
596 /* rearranging filter */
597 filt = LD_SH(filter_horiz);
598 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
599
600 mask1 = mask0 + 2;
601 mask2 = mask0 + 4;
602
603 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604 src += (5 * src_stride);
605
606 XORI_B5_128_SB(src0, src1, src2, src3, src4);
607 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
608 filt_hz1, filt_hz2);
609 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
610 filt_hz1, filt_hz2);
611 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
612 filt_hz1, filt_hz2);
613 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
614 filt_hz1, filt_hz2);
615 hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
616 filt_hz1, filt_hz2);
617
618 filt = LD_SH(filter_vert);
619 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
620
621 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
623
624 for (loop_cnt = (height >> 2); loop_cnt--;) {
625 LD_SB4(src, src_stride, src5, src6, src7, src8);
626 src += (4 * src_stride);
627
628 XORI_B4_128_SB(src5, src6, src7, src8);
629 hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
630 filt_hz1, filt_hz2);
631 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
633
634 hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
635 filt_hz1, filt_hz2);
636 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
638
639 hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
640 filt_hz1, filt_hz2);
641 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642 tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
643
644 hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
645 filt_hz1, filt_hz2);
646 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647 tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
648
649 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
650 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651 vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
652 vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
653 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654 dst += (4 * dst_stride);
655
656 hz_out4 = hz_out8;
657 out0 = out2;
658 out1 = out7;
659 out3 = out5;
660 out4 = out6;
661 }
662 }
663
664
/*
 * 16-pixel-wide 6-tap horizontal + 6-tap vertical interpolation, done as
 * two 8-pixel-wide passes: one at (dst, src) and one at (dst + 8, src + 8),
 * with all other arguments forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* columns 0..7 */
    ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);

    /* columns 8..15 */
    ff_put_vp8_epel8_h6v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
679
common_hz_4t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)680 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
681 uint8_t *dst, int32_t dst_stride,
682 const int8_t *filter)
683 {
684 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685 v8i16 filt, out0, out1;
686 v16u8 out;
687
688 mask0 = LD_SB(&mc_filt_mask_arr[16]);
689 src -= 1;
690
691 /* rearranging filter */
692 filt = LD_SH(filter);
693 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694
695 mask1 = mask0 + 2;
696
697 LD_SB4(src, src_stride, src0, src1, src2, src3);
698 XORI_B4_128_SB(src0, src1, src2, src3);
699 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700 filt0, filt1, out0, out1);
701 SRARI_H2_SH(out0, out1, 7);
702 SAT_SH2_SH(out0, out1, 7);
703 out = PCKEV_XORI128_UB(out0, out1);
704 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705 }
706
common_hz_4t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)707 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
708 uint8_t *dst, int32_t dst_stride,
709 const int8_t *filter)
710 {
711 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712 v16u8 out;
713 v8i16 filt, out0, out1, out2, out3;
714
715 mask0 = LD_SB(&mc_filt_mask_arr[16]);
716 src -= 1;
717
718 /* rearranging filter */
719 filt = LD_SH(filter);
720 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721
722 mask1 = mask0 + 2;
723
724 LD_SB4(src, src_stride, src0, src1, src2, src3);
725 src += (4 * src_stride);
726
727 XORI_B4_128_SB(src0, src1, src2, src3);
728 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729 filt0, filt1, out0, out1);
730 LD_SB4(src, src_stride, src0, src1, src2, src3);
731 XORI_B4_128_SB(src0, src1, src2, src3);
732 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733 filt0, filt1, out2, out3);
734 SRARI_H4_SH(out0, out1, out2, out3, 7);
735 SAT_SH4_SH(out0, out1, out2, out3, 7);
736 out = PCKEV_XORI128_UB(out0, out1);
737 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738 out = PCKEV_XORI128_UB(out2, out3);
739 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740 }
741
common_hz_4t_4x16_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)742 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
743 uint8_t *dst, int32_t dst_stride,
744 const int8_t *filter)
745 {
746 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747 v16i8 filt0, filt1, mask0, mask1;
748 v16u8 out;
749 v8i16 filt, out0, out1, out2, out3;
750
751 mask0 = LD_SB(&mc_filt_mask_arr[16]);
752 src -= 1;
753
754 /* rearranging filter */
755 filt = LD_SH(filter);
756 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757
758 mask1 = mask0 + 2;
759
760 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761 src += (8 * src_stride);
762 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764 filt0, filt1, out0, out1);
765 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766 filt0, filt1, out2, out3);
767 SRARI_H4_SH(out0, out1, out2, out3, 7);
768 SAT_SH4_SH(out0, out1, out2, out3, 7);
769 out = PCKEV_XORI128_UB(out0, out1);
770 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771 dst += (4 * dst_stride);
772 out = PCKEV_XORI128_UB(out2, out3);
773 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774 dst += (4 * dst_stride);
775
776 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777 src += (8 * src_stride);
778 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780 filt0, filt1, out0, out1);
781 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782 filt0, filt1, out2, out3);
783 SRARI_H4_SH(out0, out1, out2, out3, 7);
784 SAT_SH4_SH(out0, out1, out2, out3, 7);
785 out = PCKEV_XORI128_UB(out0, out1);
786 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787 dst += (4 * dst_stride);
788 out = PCKEV_XORI128_UB(out2, out3);
789 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790 }
791
ff_put_vp8_epel4_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)792 void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793 uint8_t *src, ptrdiff_t src_stride,
794 int height, int mx, int my)
795 {
796 const int8_t *filter = subpel_filters_msa[mx - 1];
797
798 if (4 == height) {
799 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800 } else if (8 == height) {
801 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802 } else if (16 == height) {
803 common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804 }
805 }
806
ff_put_vp8_epel8_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)807 void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808 uint8_t *src, ptrdiff_t src_stride,
809 int height, int mx, int my)
810 {
811 uint32_t loop_cnt;
812 const int8_t *filter = subpel_filters_msa[mx - 1];
813 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814 v16u8 tmp0, tmp1;
815 v8i16 filt, out0, out1, out2, out3;
816
817 mask0 = LD_SB(&mc_filt_mask_arr[0]);
818 src -= 1;
819
820 /* rearranging filter */
821 filt = LD_SH(filter);
822 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823
824 mask1 = mask0 + 2;
825
826 for (loop_cnt = (height >> 2); loop_cnt--;) {
827 LD_SB4(src, src_stride, src0, src1, src2, src3);
828 src += (4 * src_stride);
829
830 XORI_B4_128_SB(src0, src1, src2, src3);
831 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832 filt1, out0, out1, out2, out3);
833 SRARI_H4_SH(out0, out1, out2, out3, 7);
834 SAT_SH4_SH(out0, out1, out2, out3, 7);
835 tmp0 = PCKEV_XORI128_UB(out0, out1);
836 tmp1 = PCKEV_XORI128_UB(out2, out3);
837 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838 dst += (4 * dst_stride);
839 }
840 }
841
ff_put_vp8_epel16_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)842 void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843 uint8_t *src, ptrdiff_t src_stride,
844 int height, int mx, int my)
845 {
846 uint32_t loop_cnt;
847 const int8_t *filter = subpel_filters_msa[mx - 1];
848 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849 v16i8 filt0, filt1, mask0, mask1;
850 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851 v16u8 out;
852
853 mask0 = LD_SB(&mc_filt_mask_arr[0]);
854 src -= 1;
855
856 /* rearranging filter */
857 filt = LD_SH(filter);
858 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859
860 mask1 = mask0 + 2;
861
862 for (loop_cnt = (height >> 2); loop_cnt--;) {
863 LD_SB4(src, src_stride, src0, src2, src4, src6);
864 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865 src += (4 * src_stride);
866
867 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869 filt1, out0, out1, out2, out3);
870 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871 filt1, out4, out5, out6, out7);
872 SRARI_H4_SH(out0, out1, out2, out3, 7);
873 SRARI_H4_SH(out4, out5, out6, out7, 7);
874 SAT_SH4_SH(out0, out1, out2, out3, 7);
875 SAT_SH4_SH(out4, out5, out6, out7, 7);
876 out = PCKEV_XORI128_UB(out0, out1);
877 ST_UB(out, dst);
878 dst += dst_stride;
879 out = PCKEV_XORI128_UB(out2, out3);
880 ST_UB(out, dst);
881 dst += dst_stride;
882 out = PCKEV_XORI128_UB(out4, out5);
883 ST_UB(out, dst);
884 dst += dst_stride;
885 out = PCKEV_XORI128_UB(out6, out7);
886 ST_UB(out, dst);
887 dst += dst_stride;
888 }
889 }
890
/*
 * Vertical 4-tap sub-pixel filter for a 4-pixel-wide block.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              not used by this function.
 * my:              selects row (my - 1) of subpel_filters_msa.
 */
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    /* the filter window begins one row above the output row */
    src -= src_stride;

    /* Broadcast halfword lanes 0 and 1 of the loaded filter row.
     * NOTE(review): LD_SH presumably reads a whole 16-byte vector starting
     * at an 8-byte table row -- confirm the over-read is acceptable. */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    /* byte-interleave neighbouring rows: (row0,row1) and (row1,row2) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* join both interleaved pairs in one vector, then xor with 128 */
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* three of this iteration's four new rows */
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        /* The fourth new row is loaded into src2 on purpose: it is the
         * newest row here and the oldest row consumed by the interleave at
         * the top of the next iteration. */
        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        /* src2110 is overwritten with the newest pairs; it is both the
         * second operand below and the first operand of the next pass */
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        /* one 32-bit word per output row, four rows */
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
937
/*
 * Vertical 4-tap sub-pixel filter for an 8-pixel-wide block.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              not used by this function.
 * my:              selects row (my - 1) of subpel_filters_msa.
 */
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    /* the filter window begins one row above the output row */
    src -= src_stride;

    /* broadcast halfword lanes 0 and 1 of the loaded filter row */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* byte-interleave neighbouring rows: (row0,row1) and (row1,row2) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* four new rows; src7 directly follows the carried src2 */
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        /* each output row combines two interleaved pairs two rows apart */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the last two pairs and the last row into the next pass */
        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
983
/*
 * Vertical 4-tap sub-pixel filter for a 16-pixel-wide block.
 *
 * Same row-window scheme as the narrower vertical 4-tap kernels, but both
 * the right ("_r") and left ("_l") interleaves of each row pair are
 * filtered so that a full 16-byte row is written.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              not used by this function.
 * my:              selects row (my - 1) of subpel_filters_msa.
 */
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                              uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* the filter window begins one row above the output row */
    src -= src_stride;

    /* broadcast halfword lanes 0 and 1 of the loaded filter row */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prime the window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* interleave neighbouring rows, once per half of the vector */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* four new rows; src3 directly follows the carried src2 */
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        /* each output row combines two interleaved pairs two rows apart */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        /* merge the left/right results of each row into one byte vector */
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the last two pairs (both halves) and the last row */
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}
1042
/*
 * Two-dimensional sub-pixel filter for a 4-pixel-wide block: 4-tap
 * horizontal pass followed by a 4-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  pixel left of and one row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of subpel_filters_msa.
 * my:              selects the vertical row (my - 1) of subpel_filters_msa.
 */
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    /* Second mask set ("4 width cases"): its indices reach into both
     * shuffle inputs, so each horizontal call takes two rows. */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back one pixel and one row to the start of the filter window */
    src -= (1 + 1 * src_stride);

    /* broadcast halfword lanes 0 and 1 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    /* same shuffle pattern with every index advanced by two */
    mask1 = mask0 + 2;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* horizontal pass over the row pairs (src0,src1) and (src1,src2) */
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* broadcast halfword lanes 0 and 1 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B2_128_SB(src3, src4);
        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
        /* hz_out2 is assembled from the carried hz_out1 and the new hz_out3
         * (byte slide of 8), presumably giving the row pair that sits
         * between them -- verify against the sldi.b semantics */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        XORI_B2_128_SB(src5, src6);
        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* one 32-bit word per output row, four rows */
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and interleaved pair */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1101
/*
 * Two-dimensional sub-pixel filter for an 8-pixel-wide block: 4-tap
 * horizontal pass followed by a 4-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  pixel left of and one row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of subpel_filters_msa.
 * my:              selects the vertical row (my - 1) of subpel_filters_msa.
 */
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out0, out1;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vec0, vec1, vec2, vec3, vec4;

    /* first mask set ("8 width cases") */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back one pixel and one row to the start of the filter window */
    src -= (1 + 1 * src_stride);

    /* broadcast halfword lanes 0 and 1 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    /* same shuffle pattern with every index advanced by two */
    mask1 = mask0 + 2;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* one row per call: the same vector is given as both shuffle inputs */
    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    /* vec0 pairs rows (0,1); vec2 pairs rows (1,2) */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* broadcast halfword lanes 0 and 1 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        /* output row 0: carried pair vec0 with new pair (hz_out2,hz_out3) */
        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried pair vec2 with new pair (hz_out3,hz_out0) */
        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /* output row 2 */
        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);

        /* output row 3; vec0 and vec1 are deliberately overwritten here,
         * their old values are no longer needed */
        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the two newest pairs; hz_out2 already holds the last row */
        vec0 = vec4;
        vec2 = vec1;
    }
}
1168
/*
 * 16-pixel-wide variant of the 4-tap horizontal / 4-tap vertical filter.
 * The block is processed as two side-by-side 8-pixel columns, left column
 * first, each handled by ff_put_vp8_epel8_h4v4_msa with the same
 * strides, height, mx and my.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h4v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1183
/*
 * Two-dimensional sub-pixel filter for a 4-pixel-wide block: 6-tap
 * horizontal pass followed by a 4-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts two
 *                  pixels left of and one row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of
 *                  subpel_filters_msa; three halfword lanes are used.
 * my:              selects the vertical row (my - 1) of
 *                  subpel_filters_msa; only the first two halfword lanes
 *                  are used (presumably callers pass a 4-tap row here).
 */
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2;
    v16u8 res0, res1, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    /* Second mask set ("4 width cases"): its indices reach into both
     * shuffle inputs, so each horizontal call takes two rows. */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back two pixels and one row to the start of the filter window */
    src -= (2 + 1 * src_stride);

    /* broadcast halfword lanes 0..2 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* same shuffle pattern with every index advanced by two and by four */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* horizontal pass over the row pairs (src0,src1) and (src1,src2) */
    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* broadcast halfword lanes 0 and 1 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        /* hz_out2 is assembled from the carried hz_out1 and the new hz_out3
         * (byte slide of 8), presumably giving the row pair that sits
         * between them -- verify against the sldi.b semantics */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        /* pack to bytes and undo the xor-128 bias applied on input */
        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
        XORI_B2_128_UB(res0, res1);
        /* two 32-bit words per store: rows 0-1 from res0, rows 2-3 from res1 */
        ST_W2(res0, 0, 1, dst, dst_stride);
        ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and interleaved pair */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1249
/*
 * Two-dimensional sub-pixel filter for an 8-pixel-wide block: 6-tap
 * horizontal pass followed by a 4-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts two
 *                  pixels left of and one row above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of
 *                  subpel_filters_msa; three halfword lanes are used.
 * my:              selects the vertical row (my - 1) of
 *                  subpel_filters_msa; only the first two halfword lanes
 *                  are used (presumably callers pass a 4-tap row here).
 */
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
    v16u8 out0, out1;

    /* first mask set ("8 width cases") */
    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    /* step back two pixels and one row to the start of the filter window */
    src -= (2 + src_stride);

    /* broadcast halfword lanes 0..2 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* same shuffle pattern with every index advanced by two and by four */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prime the vertical window with the first three rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* one row per call: the same vector is given as both shuffle inputs */
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    /* vec0 pairs rows (0,1); vec2 pairs rows (1,2) */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* broadcast halfword lanes 0 and 1 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);

        /* output row 0: carried pair vec0 with new pair (hz_out2,hz_out3) */
        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried pair vec2 with new pair (hz_out3,hz_out0) */
        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /* output row 2; the pair written to vec0 here is also the value
         * vec0 must hold at the top of the next iteration */
        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        /* output row 3; vec1 is rebuilt and vec2 receives the newest pair,
         * which is likewise reused by the next iteration */
        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
1322
/*
 * 16-pixel-wide variant of the 6-tap horizontal / 4-tap vertical filter.
 * The block is processed as two side-by-side 8-pixel columns, left column
 * first, each handled by ff_put_vp8_epel8_h6v4_msa with the same
 * strides, height, mx and my.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h6v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1337
/*
 * Two-dimensional sub-pixel filter for a 4-pixel-wide block: 4-tap
 * horizontal pass followed by a 6-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  pixel left of and two rows above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of
 *                  subpel_filters_msa; only the first two halfword lanes
 *                  are used (presumably callers pass a 4-tap row here).
 * my:              selects the vertical row (my - 1) of
 *                  subpel_filters_msa; three halfword lanes are used.
 */
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, mask0, mask1;
    v16u8 out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;

    /* Second mask set ("4 width cases"): its indices reach into both
     * shuffle inputs, so each horizontal call takes two rows. */
    mask0 = LD_SB(&mc_filt_mask_arr[16]);

    /* step back one pixel and two rows to the start of the filter window */
    src -= (1 + 2 * src_stride);

    /* broadcast halfword lanes 0 and 1 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    /* same shuffle pattern with every index advanced by two */
    mask1 = mask0 + 2;

    /* prime the vertical window with the first five rows */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* horizontal pass over row pairs (0,1), (2,3) and (3,4); hz_out1 is
     * then assembled from hz_out0 and hz_out2 with a byte slide of 8,
     * presumably yielding the pair (1,2) -- verify against sldi.b */
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

    /* broadcast halfword lanes 0..2 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        XORI_B4_128_SB(src5, src6, src7, src8);
        src += (4 * src_stride);

        /* first two output rows: three consecutive interleaved vectors */
        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        /* last two output rows: window advanced by one interleaved vector */
        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        /* one 32-bit word per output row, four rows */
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and the two newest vectors */
        hz_out3 = hz_out7;
        out0 = out2;
        out1 = out3;
    }
}
1401
/*
 * Two-dimensional sub-pixel filter for an 8-pixel-wide block: 4-tap
 * horizontal pass followed by a 6-tap vertical pass.
 *
 * dst, dst_stride: output pointer and its row pitch.
 * src, src_stride: input pointer and its row pitch; reading starts one
 *                  pixel left of and two rows above src.
 * height:          number of rows; four rows are written per iteration
 *                  for (height >> 2) iterations.
 * mx:              selects the horizontal row (mx - 1) of
 *                  subpel_filters_msa; only the first two halfword lanes
 *                  are used (presumably callers pass a 4-tap row here).
 * my:              selects the vertical row (my - 1) of
 *                  subpel_filters_msa; three halfword lanes are used.
 */
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, mask0, mask1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 vec0, vec1;

    /* first mask set ("8 width cases") */
    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    /* step back one pixel and two rows to the start of the filter window */
    src -= (1 + 2 * src_stride);

    /* broadcast halfword lanes 0 and 1 of the horizontal filter row */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    /* same shuffle pattern with every index advanced by two */
    mask1 = mask0 + 2;

    /* prime the vertical window with the first five rows */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* one row per call: the same vector is given as both shuffle inputs */
    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* two chains of row pairs: out0/out1 = (0,1),(2,3) feed the even
     * output rows, out3/out4 = (1,2),(3,4) feed the odd output rows */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

    /* broadcast halfword lanes 0..2 of the vertical filter row */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        XORI_B4_128_SB(src5, src6, src7, src8);

        /* output row 0 (even chain) */
        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        /* output row 1 (odd chain) */
        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        /* output row 2 (even chain, advanced by one pair) */
        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

        /* output row 3 (odd chain, advanced by one pair) */
        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

        /* every filter row sums to 128, hence the 7-bit scale-down */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the last row and the two newest pairs of each chain */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out6;
        out3 = out5;
        out4 = out7;
    }
}
1476
/*
 * 16-pixel-wide variant of the 4-tap horizontal / 6-tap vertical filter.
 * The block is processed as two side-by-side 8-pixel columns, left column
 * first, each handled by ff_put_vp8_epel8_h4v6_msa with the same
 * strides, height, mx and my.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1491
common_hz_2t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1492 static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1493 uint8_t *dst, int32_t dst_stride,
1494 const int8_t *filter)
1495 {
1496 v16i8 src0, src1, src2, src3, mask;
1497 v16u8 filt0, vec0, vec1, res0, res1;
1498 v8u16 vec2, vec3, filt;
1499
1500 mask = LD_SB(&mc_filt_mask_arr[16]);
1501
1502 /* rearranging filter */
1503 filt = LD_UH(filter);
1504 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505
1506 LD_SB4(src, src_stride, src0, src1, src2, src3);
1507 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1509 SRARI_H2_UH(vec2, vec3, 7);
1510 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511 ST_W2(res0, 0, 1, dst, dst_stride);
1512 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513 }
1514
common_hz_2t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1515 static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516 uint8_t *dst, int32_t dst_stride,
1517 const int8_t *filter)
1518 {
1519 v16u8 vec0, vec1, vec2, vec3, filt0;
1520 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521 v16i8 res0, res1, res2, res3;
1522 v8u16 vec4, vec5, vec6, vec7, filt;
1523
1524 mask = LD_SB(&mc_filt_mask_arr[16]);
1525
1526 /* rearranging filter */
1527 filt = LD_UH(filter);
1528 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529
1530 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534 vec4, vec5, vec6, vec7);
1535 SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537 res0, res1, res2, res3);
1538 ST_W2(res0, 0, 1, dst, dst_stride);
1539 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542 }
1543
ff_put_vp8_bilinear4_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1544 void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545 uint8_t *src, ptrdiff_t src_stride,
1546 int height, int mx, int my)
1547 {
1548 const int8_t *filter = bilinear_filters_msa[mx - 1];
1549
1550 if (4 == height) {
1551 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552 } else if (8 == height) {
1553 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554 }
1555 }
1556
common_hz_2t_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1557 static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1558 uint8_t *dst, int32_t dst_stride,
1559 const int8_t *filter)
1560 {
1561 v16u8 filt0;
1562 v16i8 src0, src1, src2, src3, mask;
1563 v8u16 vec0, vec1, vec2, vec3, filt;
1564
1565 mask = LD_SB(&mc_filt_mask_arr[0]);
1566
1567 /* rearranging filter */
1568 filt = LD_UH(filter);
1569 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570
1571 LD_SB4(src, src_stride, src0, src1, src2, src3);
1572 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575 vec0, vec1, vec2, vec3);
1576 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1577 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579 }
1580
common_hz_2t_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1581 static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1582 uint8_t *dst, int32_t dst_stride,
1583 const int8_t *filter, int32_t height)
1584 {
1585 v16u8 filt0;
1586 v16i8 src0, src1, src2, src3, mask, out0, out1;
1587 v8u16 vec0, vec1, vec2, vec3, filt;
1588
1589 mask = LD_SB(&mc_filt_mask_arr[0]);
1590
1591 /* rearranging filter */
1592 filt = LD_UH(filter);
1593 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594
1595 LD_SB4(src, src_stride, src0, src1, src2, src3);
1596 src += (4 * src_stride);
1597
1598 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601 vec0, vec1, vec2, vec3);
1602 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603
1604 LD_SB4(src, src_stride, src0, src1, src2, src3);
1605 src += (4 * src_stride);
1606
1607 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609
1610 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613 vec0, vec1, vec2, vec3);
1614 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617 dst += (8 * dst_stride);
1618
1619 if (16 == height) {
1620 LD_SB4(src, src_stride, src0, src1, src2, src3);
1621 src += (4 * src_stride);
1622
1623 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626 vec0, vec1, vec2, vec3);
1627 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628 LD_SB4(src, src_stride, src0, src1, src2, src3);
1629 src += (4 * src_stride);
1630
1631 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633
1634 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637 vec0, vec1, vec2, vec3);
1638 SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641 }
1642 }
1643
ff_put_vp8_bilinear8_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1644 void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645 uint8_t *src, ptrdiff_t src_stride,
1646 int height, int mx, int my)
1647 {
1648 const int8_t *filter = bilinear_filters_msa[mx - 1];
1649
1650 if (4 == height) {
1651 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652 } else {
1653 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654 height);
1655 }
1656 }
1657
ff_put_vp8_bilinear16_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1658 void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659 uint8_t *src, ptrdiff_t src_stride,
1660 int height, int mx, int my)
1661 {
1662 uint32_t loop_cnt;
1663 const int8_t *filter = bilinear_filters_msa[mx - 1];
1664 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667
1668 mask = LD_SB(&mc_filt_mask_arr[0]);
1669
1670 loop_cnt = (height >> 2) - 1;
1671
1672 /* rearranging filter */
1673 filt = LD_UH(filter);
1674 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675
1676 LD_SB4(src, src_stride, src0, src2, src4, src6);
1677 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678 src += (4 * src_stride);
1679
1680 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685 out0, out1, out2, out3);
1686 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687 out4, out5, out6, out7);
1688 SRARI_H4_UH(out0, out1, out2, out3, 7);
1689 SRARI_H4_UH(out4, out5, out6, out7, 7);
1690 PCKEV_ST_SB(out0, out1, dst);
1691 dst += dst_stride;
1692 PCKEV_ST_SB(out2, out3, dst);
1693 dst += dst_stride;
1694 PCKEV_ST_SB(out4, out5, dst);
1695 dst += dst_stride;
1696 PCKEV_ST_SB(out6, out7, dst);
1697 dst += dst_stride;
1698
1699 for (; loop_cnt--;) {
1700 LD_SB4(src, src_stride, src0, src2, src4, src6);
1701 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702 src += (4 * src_stride);
1703
1704 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709 out0, out1, out2, out3);
1710 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711 out4, out5, out6, out7);
1712 SRARI_H4_UH(out0, out1, out2, out3, 7);
1713 SRARI_H4_UH(out4, out5, out6, out7, 7);
1714 PCKEV_ST_SB(out0, out1, dst);
1715 dst += dst_stride;
1716 PCKEV_ST_SB(out2, out3, dst);
1717 dst += dst_stride;
1718 PCKEV_ST_SB(out4, out5, dst);
1719 dst += dst_stride;
1720 PCKEV_ST_SB(out6, out7, dst);
1721 dst += dst_stride;
1722 }
1723 }
1724
common_vt_2t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1725 static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726 uint8_t *dst, int32_t dst_stride,
1727 const int8_t *filter)
1728 {
1729 v16i8 src0, src1, src2, src3, src4;
1730 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731 v16u8 filt0;
1732 v8i16 filt;
1733 v8u16 tmp0, tmp1;
1734
1735 filt = LD_SH(filter);
1736 filt0 = (v16u8) __msa_splati_h(filt, 0);
1737
1738 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739 src += (5 * src_stride);
1740
1741 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742 src10_r, src21_r, src32_r, src43_r);
1743 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745 SRARI_H2_UH(tmp0, tmp1, 7);
1746 SAT_UH2_UH(tmp0, tmp1, 7);
1747 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749 }
1750
common_vt_2t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1751 static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752 uint8_t *dst, int32_t dst_stride,
1753 const int8_t *filter)
1754 {
1755 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758 v8u16 tmp0, tmp1, tmp2, tmp3;
1759 v16u8 filt0;
1760 v8i16 filt;
1761
1762 filt = LD_SH(filter);
1763 filt0 = (v16u8) __msa_splati_h(filt, 0);
1764
1765 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766 src += (8 * src_stride);
1767
1768 src8 = LD_SB(src);
1769 src += src_stride;
1770
1771 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772 src32_r, src43_r);
1773 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774 src76_r, src87_r);
1775 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776 src87_r, src76_r, src2110, src4332, src6554, src8776);
1777 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778 tmp0, tmp1, tmp2, tmp3);
1779 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783 }
1784
ff_put_vp8_bilinear4_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1785 void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786 uint8_t *src, ptrdiff_t src_stride,
1787 int height, int mx, int my)
1788 {
1789 const int8_t *filter = bilinear_filters_msa[my - 1];
1790
1791 if (4 == height) {
1792 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793 } else if (8 == height) {
1794 common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795 }
1796 }
1797
common_vt_2t_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1798 static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1799 uint8_t *dst, int32_t dst_stride,
1800 const int8_t *filter)
1801 {
1802 v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803 v16i8 out0, out1;
1804 v8u16 tmp0, tmp1, tmp2, tmp3;
1805 v8i16 filt;
1806
1807 /* rearranging filter_y */
1808 filt = LD_SH(filter);
1809 filt0 = (v16u8) __msa_splati_h(filt, 0);
1810
1811 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815 tmp0, tmp1, tmp2, tmp3);
1816 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820 }
1821
common_vt_2t_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1822 static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1823 uint8_t *dst, int32_t dst_stride,
1824 const int8_t *filter, int32_t height)
1825 {
1826 uint32_t loop_cnt;
1827 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829 v16i8 out0, out1;
1830 v8u16 tmp0, tmp1, tmp2, tmp3;
1831 v8i16 filt;
1832
1833 /* rearranging filter_y */
1834 filt = LD_SH(filter);
1835 filt0 = (v16u8) __msa_splati_h(filt, 0);
1836
1837 src0 = LD_UB(src);
1838 src += src_stride;
1839
1840 for (loop_cnt = (height >> 3); loop_cnt--;) {
1841 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842 src += (8 * src_stride);
1843
1844 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845 vec0, vec1, vec2, vec3);
1846 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847 vec4, vec5, vec6, vec7);
1848 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849 tmp0, tmp1, tmp2, tmp3);
1850 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854
1855 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856 tmp0, tmp1, tmp2, tmp3);
1857 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861 dst += (8 * dst_stride);
1862
1863 src0 = src8;
1864 }
1865 }
1866
ff_put_vp8_bilinear8_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1867 void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868 uint8_t *src, ptrdiff_t src_stride,
1869 int height, int mx, int my)
1870 {
1871 const int8_t *filter = bilinear_filters_msa[my - 1];
1872
1873 if (4 == height) {
1874 common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875 } else {
1876 common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877 height);
1878 }
1879 }
1880
ff_put_vp8_bilinear16_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1881 void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882 uint8_t *src, ptrdiff_t src_stride,
1883 int height, int mx, int my)
1884 {
1885 uint32_t loop_cnt;
1886 const int8_t *filter = bilinear_filters_msa[my - 1];
1887 v16u8 src0, src1, src2, src3, src4;
1888 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889 v8u16 tmp0, tmp1, tmp2, tmp3;
1890 v8i16 filt;
1891
1892 /* rearranging filter_y */
1893 filt = LD_SH(filter);
1894 filt0 = (v16u8) __msa_splati_h(filt, 0);
1895
1896 src0 = LD_UB(src);
1897 src += src_stride;
1898
1899 for (loop_cnt = (height >> 2); loop_cnt--;) {
1900 LD_UB4(src, src_stride, src1, src2, src3, src4);
1901 src += (4 * src_stride);
1902
1903 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906 SRARI_H2_UH(tmp0, tmp1, 7);
1907 SAT_UH2_UH(tmp0, tmp1, 7);
1908 PCKEV_ST_SB(tmp0, tmp1, dst);
1909 dst += dst_stride;
1910
1911 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914 SRARI_H2_UH(tmp2, tmp3, 7);
1915 SAT_UH2_UH(tmp2, tmp3, 7);
1916 PCKEV_ST_SB(tmp2, tmp3, dst);
1917 dst += dst_stride;
1918
1919 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920 SRARI_H2_UH(tmp0, tmp1, 7);
1921 SAT_UH2_UH(tmp0, tmp1, 7);
1922 PCKEV_ST_SB(tmp0, tmp1, dst);
1923 dst += dst_stride;
1924
1925 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926 SRARI_H2_UH(tmp2, tmp3, 7);
1927 SAT_UH2_UH(tmp2, tmp3, 7);
1928 PCKEV_ST_SB(tmp2, tmp3, dst);
1929 dst += dst_stride;
1930
1931 src0 = src4;
1932 }
1933 }
1934
common_hv_2ht_2vt_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)1935 static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
1936 uint8_t *dst, int32_t dst_stride,
1937 const int8_t *filter_horiz,
1938 const int8_t *filter_vert)
1939 {
1940 v16i8 src0, src1, src2, src3, src4, mask;
1941 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943
1944 mask = LD_SB(&mc_filt_mask_arr[16]);
1945
1946 /* rearranging filter */
1947 filt = LD_UH(filter_horiz);
1948 filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949
1950 filt = LD_UH(filter_vert);
1951 filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952
1953 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959
1960 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962 SRARI_H2_UH(tmp0, tmp1, 7);
1963 SAT_UH2_UH(tmp0, tmp1, 7);
1964 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965 ST_W2(res0, 0, 1, dst, dst_stride);
1966 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967 }
1968
common_hv_2ht_2vt_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)1969 static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
1970 uint8_t *dst, int32_t dst_stride,
1971 const int8_t *filter_horiz,
1972 const int8_t *filter_vert)
1973 {
1974 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975 v16i8 res0, res1, res2, res3;
1976 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979
1980 mask = LD_SB(&mc_filt_mask_arr[16]);
1981
1982 /* rearranging filter */
1983 filt = LD_UH(filter_horiz);
1984 filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985
1986 filt = LD_UH(filter_vert);
1987 filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988
1989 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990 src += (8 * src_stride);
1991 src8 = LD_SB(src);
1992
1993 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
1999 hz_out3, hz_out5);
2000 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001
2002 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005 vec4, vec5, vec6, vec7);
2006 SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007 SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009 res0, res1, res2, res3);
2010 ST_W2(res0, 0, 1, dst, dst_stride);
2011 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014 }
2015
ff_put_vp8_bilinear4_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2016 void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017 uint8_t *src, ptrdiff_t src_stride,
2018 int height, int mx, int my)
2019 {
2020 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021 const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022
2023 if (4 == height) {
2024 common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025 filter_horiz, filter_vert);
2026 } else if (8 == height) {
2027 common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028 filter_horiz, filter_vert);
2029 }
2030 }
2031
common_hv_2ht_2vt_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)2032 static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
2033 uint8_t *dst, int32_t dst_stride,
2034 const int8_t *filter_horiz,
2035 const int8_t *filter_vert)
2036 {
2037 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040 v8i16 filt;
2041
2042 mask = LD_SB(&mc_filt_mask_arr[0]);
2043
2044 /* rearranging filter */
2045 filt = LD_SH(filter_horiz);
2046 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047
2048 filt = LD_SH(filter_vert);
2049 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050
2051 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052
2053 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057
2058 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061
2062 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065
2066 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069
2070 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074 }
2075
common_hv_2ht_2vt_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)2076 static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
2077 uint8_t *dst, int32_t dst_stride,
2078 const int8_t *filter_horiz,
2079 const int8_t *filter_vert,
2080 int32_t height)
2081 {
2082 uint32_t loop_cnt;
2083 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084 v16u8 filt_hz, filt_vt, vec0;
2085 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086 v8i16 filt;
2087
2088 mask = LD_SB(&mc_filt_mask_arr[0]);
2089
2090 /* rearranging filter */
2091 filt = LD_SH(filter_horiz);
2092 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093
2094 filt = LD_SH(filter_vert);
2095 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096
2097 src0 = LD_SB(src);
2098 src += src_stride;
2099
2100 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101
2102 for (loop_cnt = (height >> 3); loop_cnt--;) {
2103 LD_SB4(src, src_stride, src1, src2, src3, src4);
2104 src += (4 * src_stride);
2105
2106 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109
2110 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113
2114 SRARI_H2_UH(tmp1, tmp2, 7);
2115 SAT_UH2_UH(tmp1, tmp2, 7);
2116
2117 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120
2121 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122 LD_SB4(src, src_stride, src1, src2, src3, src4);
2123 src += (4 * src_stride);
2124 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126
2127 SRARI_H2_UH(tmp3, tmp4, 7);
2128 SAT_UH2_UH(tmp3, tmp4, 7);
2129 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131
2132 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135
2136 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139
2140 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143
2144 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147
2148 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149 SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152 dst += (8 * dst_stride);
2153 }
2154 }
2155
ff_put_vp8_bilinear8_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2156 void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157 uint8_t *src, ptrdiff_t src_stride,
2158 int height, int mx, int my)
2159 {
2160 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161 const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162
2163 if (4 == height) {
2164 common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165 filter_horiz, filter_vert);
2166 } else {
2167 common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168 filter_horiz, filter_vert, height);
2169 }
2170 }
2171
ff_put_vp8_bilinear16_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2172 void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173 uint8_t *src, ptrdiff_t src_stride,
2174 int height, int mx, int my)
2175 {
2176 uint32_t loop_cnt;
2177 const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178 const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180 v16u8 filt_hz, filt_vt, vec0, vec1;
2181 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182 v8i16 filt;
2183
2184 mask = LD_SB(&mc_filt_mask_arr[0]);
2185
2186 /* rearranging filter */
2187 filt = LD_SH(filter_horiz);
2188 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189
2190 filt = LD_SH(filter_vert);
2191 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192
2193 LD_SB2(src, 8, src0, src1);
2194 src += src_stride;
2195
2196 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198
2199
2200 for (loop_cnt = (height >> 2); loop_cnt--;) {
2201 LD_SB4(src, src_stride, src0, src2, src4, src6);
2202 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203 src += (4 * src_stride);
2204
2205 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209 SRARI_H2_UH(tmp1, tmp2, 7);
2210 SAT_UH2_UH(tmp1, tmp2, 7);
2211 PCKEV_ST_SB(tmp1, tmp2, dst);
2212 dst += dst_stride;
2213
2214 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218 SRARI_H2_UH(tmp1, tmp2, 7);
2219 SAT_UH2_UH(tmp1, tmp2, 7);
2220 PCKEV_ST_SB(tmp1, tmp2, dst);
2221 dst += dst_stride;
2222
2223 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227 SRARI_H2_UH(tmp1, tmp2, 7);
2228 SAT_UH2_UH(tmp1, tmp2, 7);
2229 PCKEV_ST_SB(tmp1, tmp2, dst);
2230 dst += dst_stride;
2231
2232 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236 SRARI_H2_UH(tmp1, tmp2, 7);
2237 SAT_UH2_UH(tmp1, tmp2, 7);
2238 PCKEV_ST_SB(tmp1, tmp2, dst);
2239 dst += dst_stride;
2240 }
2241 }
2242
ff_put_vp8_pixels8_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2243 void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244 uint8_t *src, ptrdiff_t src_stride,
2245 int height, int mx, int my)
2246 {
2247 int32_t cnt;
2248 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250
2251 if (0 == height % 8) {
2252 for (cnt = height >> 3; cnt--;) {
2253 LD_UB8(src, src_stride,
2254 src0, src1, src2, src3, src4, src5, src6, src7);
2255 src += (8 * src_stride);
2256
2257 out0 = __msa_copy_u_d((v2i64) src0, 0);
2258 out1 = __msa_copy_u_d((v2i64) src1, 0);
2259 out2 = __msa_copy_u_d((v2i64) src2, 0);
2260 out3 = __msa_copy_u_d((v2i64) src3, 0);
2261 out4 = __msa_copy_u_d((v2i64) src4, 0);
2262 out5 = __msa_copy_u_d((v2i64) src5, 0);
2263 out6 = __msa_copy_u_d((v2i64) src6, 0);
2264 out7 = __msa_copy_u_d((v2i64) src7, 0);
2265
2266 SD4(out0, out1, out2, out3, dst, dst_stride);
2267 dst += (4 * dst_stride);
2268 SD4(out4, out5, out6, out7, dst, dst_stride);
2269 dst += (4 * dst_stride);
2270 }
2271 } else if (0 == height % 4) {
2272 for (cnt = (height / 4); cnt--;) {
2273 LD_UB4(src, src_stride, src0, src1, src2, src3);
2274 src += (4 * src_stride);
2275 out0 = __msa_copy_u_d((v2i64) src0, 0);
2276 out1 = __msa_copy_u_d((v2i64) src1, 0);
2277 out2 = __msa_copy_u_d((v2i64) src2, 0);
2278 out3 = __msa_copy_u_d((v2i64) src3, 0);
2279
2280 SD4(out0, out1, out2, out3, dst, dst_stride);
2281 dst += (4 * dst_stride);
2282 }
2283 }
2284 }
2285
copy_16multx8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height,int32_t width)2286 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2287 uint8_t *dst, int32_t dst_stride,
2288 int32_t height, int32_t width)
2289 {
2290 int32_t cnt, loop_cnt;
2291 uint8_t *src_tmp, *dst_tmp;
2292 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293
2294 for (cnt = (width >> 4); cnt--;) {
2295 src_tmp = src;
2296 dst_tmp = dst;
2297
2298 for (loop_cnt = (height >> 3); loop_cnt--;) {
2299 LD_UB8(src_tmp, src_stride,
2300 src0, src1, src2, src3, src4, src5, src6, src7);
2301 src_tmp += (8 * src_stride);
2302
2303 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304 dst_tmp, dst_stride);
2305 dst_tmp += (8 * dst_stride);
2306 }
2307
2308 src += 16;
2309 dst += 16;
2310 }
2311 }
2312
ff_put_vp8_pixels16_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2313 void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314 uint8_t *src, ptrdiff_t src_stride,
2315 int height, int mx, int my)
2316 {
2317 int32_t cnt;
2318 v16u8 src0, src1, src2, src3;
2319
2320 if (0 == height % 8) {
2321 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322 } else if (0 == height % 4) {
2323 for (cnt = (height >> 2); cnt--;) {
2324 LD_UB4(src, src_stride, src0, src1, src2, src3);
2325 src += (4 * src_stride);
2326
2327 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328 dst += (4 * dst_stride);
2329 }
2330 }
2331 }
2332