/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"

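/* Byte-shuffle masks (for __msa_vshf_b) that gather the six taps of the
 * H.264 luma 6-tap filter as pairs: rows 0..2 pair taps (0,5), (1,4) and
 * (2,3), so a pairwise add / dot product with the coefficient pairs
 * {1, 1}, {-5, -5} and {20, 20} produces the filter sum. The "4 width"
 * rows index a second source vector (elements 16..31) so two 4-wide rows
 * can be filtered at once. */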
static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
};

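/* Apply the 6-tap filter to six single-row byte vectors (vec0..vec5 hold
 * taps 0..5): interleave the tap pairs (0,5), (1,4), (2,3) and accumulate
 * them with weights {1, 1}, {-5, -5}, {20, 20} into two signed halfword
 * vectors (low and high halves of the interleaved data). */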
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
                                        out1, out2)                          \
{                                                                            \
    v16i8 tmp0_m, tmp1_m;                                                    \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
                                                                             \
    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
}

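/* Horizontal 6-tap filter for one row held in (in0, in1): the three masks
 * gather the tap pairs, which accumulate as
 *     (s0 + s5) - 5 * (s1 + s4) + 20 * (s2 + s3)
 * in signed halfwords. Callers still apply the (x + 16) >> 5 rounding and
 * saturation. */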
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
( {                                                        \
    v8i16 out0_m;                                          \
    v16i8 tmp0_m;                                          \
    v16i8 minus5b = __msa_ldi_b(-5);                       \
    v16i8 plus20b = __msa_ldi_b(20);                       \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);        \
    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);               \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);     \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);     \
                                                           \
    out0_m;                                                \
} )

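/* Three packed dot products accumulated into one halfword vector; with the
 * coefficient pairs used below this evaluates the vertical 6-tap filter on
 * byte data, two interleaved rows per input vector. */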
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )

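/* Same accumulation on halfword inputs, followed by the (x + 512) >> 10
 * rounding and saturation used when the 6-tap filter has already been
 * applied once in the other direction (two-stage interpolation). */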
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v4i32 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \
    out0_m = __msa_srari_w(out0_m, 10);                             \
    out0_m = __msa_sat_s_w(out0_m, 7);                              \
    out0_m;                                                         \
} )

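/* "hv" quarter-pel positions: the rounded average of a horizontal and a
 * vertical half-pel result, each obtained with the reference filter
 *     out = s[-2] - 5 * s[-1] + 20 * s[0] + 20 * s[1] - 5 * s[2] + s[3]
 * rounded as (out + 16) >> 5. The filt_const values pack coefficient pairs
 * as little-endian bytes in each halfword: 0xfb01 = {1, -5},
 * 0x1414 = {20, 20}, 0x01fb = {-5, 1}, matching the interleaved row pairs
 * fed to AVC_DOT_SH3_SH. Sources are XORed with 128 so signed byte
 * arithmetic can be used; PCKEV_XORI128_UB removes the bias again. */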
static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
                                    uint8_t *dst, int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);

    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    SRARI_H2_SH(vt_out0, vt_out1, 5);
    SAT_SH2_SH(vt_out0, vt_out1, 7);

    out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    out1 = __msa_srari_h((hz_out1 + vt_out1), 1);

    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

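/* 8x8 "hv" quarter-pel: as above, but on full 8-wide rows; the second block
 * of four rows reuses the interleaved vertical context
 * (src_vt54_r..src_vt87_r) from the first block. */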
static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
                                    uint8_t *dst, int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    src_x += (4 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
    src_y += (4 * stride);
    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);

    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
                             filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
               src_vt1211_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
                             filt1, filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

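/* 16x16 "hv" quarter-pel, processed as two 8-wide column halves
 * (multiple8_cnt), each in four passes of four rows; the last five vertical
 * source rows are carried between passes. */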
static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
                                      const uint8_t *src_y, uint8_t *dst,
                                      int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    const uint8_t *src_x_tmp = src_x;
    const uint8_t *src_y_tmp = src_y;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 tmp0, tmp1;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, out0, out1, out2, out3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src_x = src_x_tmp;
        src_y = src_y_tmp;
        dst = dst_tmp;

        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
        src_y += (5 * stride);

        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
            src_x += (4 * stride);

            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
            src_y += (4 * stride);

            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
                       src_vt43_r);
            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
                       src_vt87_r);
            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
                                     filt1, filt2);
            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
                                     filt1, filt2);
            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
                                     filt1, filt2);
            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
                                     filt1, filt2);
            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);

            SAT_SH4_SH(out0, out1, out2, out3, 7);
            tmp0 = PCKEV_XORI128_UB(out0, out1);
            tmp1 = PCKEV_XORI128_UB(out2, out3);
            ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            src_vt0 = src_vt4;
            src_vt1 = src_vt5;
            src_vt2 = src_vt6;
            src_vt3 = src_vt7;
            src_vt4 = src_vt8;
        }

        src_x_tmp += 8;
        src_y_tmp += 8;
        dst_tmp += 8;
    }
}

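/* The *_and_aver_dst_* variants compute the same quarter-pel result and then
 * take the rounded average with the bytes already in dst, i.e. the "avg"
 * motion-compensation mode. */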
static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 uint8_t *dst,
                                                 int32_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);

    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    SRARI_H2_SH(vt_out0, vt_out1, 5);
    SAT_SH2_SH(vt_out0, vt_out1, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    res0 = __msa_srari_h((hz_out0 + vt_out0), 1);

    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    dst0 = __msa_aver_u_b(res, dst0);

    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

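/* 8x8 "hv" quarter-pel with destination averaging; the filtering is the same
 * as in avc_luma_hv_qrt_8x8_msa. */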
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 uint8_t *dst,
                                                 int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
    v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * stride);

    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    src_x += (4 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
    src_y += (4 * stride);
    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);

    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
                             filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);

    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
               src_vt1211_r);
    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
                             filt2);
    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
                             filt2);
    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
                             filt2);
    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
                             filt1, filt2);
    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

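/* 16x16 "hv" quarter-pel with destination averaging; the loop structure
 * mirrors avc_luma_hv_qrt_16x16_msa. */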
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
                                                   const uint8_t *src_y,
                                                   uint8_t *dst,
                                                   int32_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    const uint8_t *src_x_tmp = src_x;
    const uint8_t *src_y_tmp = src_y;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    v16i8 src_vt7, src_vt8;
    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    v8i16 vt_out3, out0, out1, out2, out3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src_x = src_x_tmp;
        src_y = src_y_tmp;
        dst = dst_tmp;

        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
        src_y += (5 * stride);

        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
            src_x += (4 * stride);

            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
            src_y += (4 * stride);

            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
                       src_vt43_r);
            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
                       src_vt87_r);
            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
                                     filt1, filt2);
            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
                                     filt1, filt2);
            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
                                     filt1, filt2);
            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
                                     filt1, filt2);
            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);

            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);

            LD4(dst, stride, tp0, tp1, tp2, tp3);
            INSERT_D2_UB(tp0, tp1, dst0);
            INSERT_D2_UB(tp2, tp3, dst1);

            SAT_SH4_SH(out0, out1, out2, out3, 7);
            tmp0 = PCKEV_XORI128_UB(out0, out1);
            tmp1 = PCKEV_XORI128_UB(out2, out3);
            AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
            ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            src_vt0 = src_vt4;
            src_vt1 = src_vt5;
            src_vt2 = src_vt6;
            src_vt3 = src_vt7;
            src_vt4 = src_vt8;
        }

        src_x_tmp += 8;
        src_y_tmp += 8;
        dst_tmp += 8;
    }
}

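/* mc00: integer-pel position, a plain block copy. */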
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
    dst += (8 * stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
}

void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    LD4(src, stride, src0, src1, src2, src3);
    src += 4 * stride;
    LD4(src, stride, src4, src5, src6, src7);
    SD4(src0, src1, src2, src3, dst, stride);
    dst += 4 * stride;
    SD4(src4, src5, src6, src7, dst, stride);
}

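/* Integer-pel averaging: dst = (src + dst + 1) >> 1 per byte, via the MSA
 * aver_u_b family. */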
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += (8 * stride);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
}

void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    LD4(src, stride, tp0, tp1, tp2, tp3);
    src += 4 * stride;
    LD4(src, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, src0);
    INSERT_D2_UB(tp2, tp3, src1);
    INSERT_D2_UB(tp4, tp5, src2);
    INSERT_D2_UB(tp6, tp7, src3);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    INSERT_D2_UB(tp4, tp5, dst2);
    INSERT_D2_UB(tp6, tp7, dst3);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);

    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, dst0 = { 0 };

    LW4(src, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    dst0 = __msa_aver_u_b(src0, dst0);

    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

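/* mc10/mc30: horizontal quarter-pel. The half-pel filter output is averaged
 * with the co-located integer sample (SLDI offset 2, since src was moved
 * back by 2) for mc10, or with its right-hand neighbour (offset 3) for
 * mc30, using signed rounded averaging on the 128-biased data. */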
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

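/* mc20: horizontal half-pel only; 6-tap filter, (x + 16) >> 5 rounding and
 * saturation, with no averaging step. */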
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += stride;
        LD_SB2(src, 8, src2, src3);
        src += stride;
        LD_SB2(src, 8, src4, src5);
        src += stride;
        LD_SB2(src, 8, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
                    vec2, vec3);
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res4, res5, res6, res7);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out2 = PCKEV_XORI128_UB(res4, res5);
    out3 = PCKEV_XORI128_UB(res6, res7);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    out = PCKEV_XORI128_UB(res0, res1);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

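/* mc01/mc03: vertical quarter-pel. The vertical half-pel output is averaged
 * with the co-located source row (src2, src3, ... of the sliding five-row
 * window) for mc01, or with the row below it for mc03. */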
ff_put_h264_qpel16_mc01_msa(uint8_t * dst,const uint8_t * src,ptrdiff_t stride)1182 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
1183 ptrdiff_t stride)
1184 {
1185 int32_t loop_cnt;
1186 int16_t filt_const0 = 0xfb01;
1187 int16_t filt_const1 = 0x1414;
1188 int16_t filt_const2 = 0x1fb;
1189 v16u8 res0, res1, res2, res3;
1190 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1191 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1192 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1193 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1194 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1195
1196 filt0 = (v16i8) __msa_fill_h(filt_const0);
1197 filt1 = (v16i8) __msa_fill_h(filt_const1);
1198 filt2 = (v16i8) __msa_fill_h(filt_const2);
1199
1200 src -= (stride * 2);
1201
1202 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1203 src += (5 * stride);
1204
1205 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1206 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1207 src32_r, src43_r);
1208 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1209 src32_l, src43_l);
1210
1211 for (loop_cnt = 4; loop_cnt--;) {
1212 LD_SB4(src, stride, src5, src6, src7, src8);
1213 src += (4 * stride);
1214
1215 XORI_B4_128_SB(src5, src6, src7, src8);
1216 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1217 src65_r, src76_r, src87_r);
1218 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1219 src65_l, src76_l, src87_l);
1220 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1221 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1222 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1223 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1224 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1225 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1226 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1227 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1228 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1229 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1230 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1231 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1232 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1233 out3_r, res0, res1, res2, res3);
1234 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1235 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1236 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1237 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
1238 XORI_B4_128_UB(res0, res1, res2, res3);
1239 ST_UB4(res0, res1, res2, res3, dst, stride);
1240 dst += (4 * stride);
1241
1242 src10_r = src54_r;
1243 src32_r = src76_r;
1244 src21_r = src65_r;
1245 src43_r = src87_r;
1246 src10_l = src54_l;
1247 src32_l = src76_l;
1248 src21_l = src65_l;
1249 src43_l = src87_l;
1250 src2 = src6;
1251 src3 = src7;
1252 src4 = src8;
1253 }
1254 }
1255
void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}

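/* qpel (0, 1/4): 8x8 vertical half-pel filter averaged with the nearest
 * integer-pel row; two 8-wide rows are packed per 128-bit vector. */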
void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

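/* qpel (0, 3/4): as mc01 above, but averaging with the integer-pel rows
 * one line below (src3..src10 instead of src2..src9). */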
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

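/* qpel (0, 1/4) for 4x4: four 4-wide interleaved rows share one vector so a
 * single dot-product chain filters the whole block. */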
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

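/* qpel (0, 3/4) for 4x4: as mc01 above, averaging with rows src3..src6
 * instead of src2..src5. */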
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

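/* qpel positions with both offsets at quarter-pel ((1|3)/4, (1|3)/4):
 * average of a horizontal and a vertical half-pel filter. The +1 column and
 * +stride row adjustments below select which half-pels feed the average. */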
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
                              stride);
}

void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                              stride);
}

void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                            stride);
}

void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
}

void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
}

void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
                            stride);
}

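/* qpel (1/2, 1/4): each row is filtered horizontally, then the 16-bit row
 * results are filtered vertically (word-precision taps 1, -5, 20) to get the
 * centre half-pel 'j', which is averaged with the horizontal half-pel of the
 * same row. The 16x16 block is processed as two 8-wide column halves. */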
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out2, 5);
            dst3 = __msa_srari_h(hz_out3, 5);
            dst5 = __msa_srari_h(hz_out4, 5);
            dst7 = __msa_srari_h(hz_out5, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

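/* qpel (1/2, 3/4): same as mc21 above, but 'j' is averaged with the
 * horizontal half-pel one row below (hz_out3..hz_out6). */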
void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        dst = dst_tmp;
        src = src_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out3, 5);
            dst3 = __msa_srari_h(hz_out4, 5);
            dst5 = __msa_srari_h(hz_out5, 5);
            dst7 = __msa_srari_h(hz_out6, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

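/* qpel (1/2, 1/4) for 8x8: fully unrolled variant of the 16x16 kernel above,
 * computing the block in two 4-row batches. */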
void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out3);
    dst2 = __msa_aver_s_h(dst2, hz_out4);
    dst3 = __msa_aver_s_h(dst3, hz_out5);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out6);
    dst1 = __msa_aver_s_h(dst1, hz_out7);
    dst2 = __msa_aver_s_h(dst2, hz_out8);
    dst3 = __msa_aver_s_h(dst3, hz_out9);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

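/* qpel (1/2, 3/4) for 8x8: as mc21 above, but averaging 'j' with the
 * horizontal half-pel one row below (hz_out3..hz_out10). */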
void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out3);
    dst1 = __msa_aver_s_h(dst1, hz_out4);
    dst2 = __msa_aver_s_h(dst2, hz_out5);
    dst3 = __msa_aver_s_h(dst3, hz_out6);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out7);
    dst1 = __msa_aver_s_h(dst1, hz_out8);
    dst2 = __msa_aver_s_h(dst2, hz_out9);
    dst3 = __msa_aver_s_h(dst3, hz_out10);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

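/* qpel (1/2, 1/4) for 4x4: two 4-wide rows are filtered per vector using the
 * narrow masks from luma_mask_arr[48]. */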
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H2_SH(hz_out2, hz_out4, 5);
    SAT_SH2_SH(hz_out2, hz_out4, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out4);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

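/* qpel (1/2, 3/4) for 4x4: as mc21 above, but the horizontal half-pels one
 * row below (rows 3..6) are repacked before the average. */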
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out0);
    dst1 = __msa_aver_s_h(dst1, hz_out1);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

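/* qpel (0, 1/2): pure vertical half-pel 6-tap filter; the 16 columns are
 * split into right/left (low/high) interleaved byte halves. */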
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}

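/* qpel (0, 1/2) for 8x8, fully unrolled. Note the interleaved-row registers
 * are reused for successive row pairs, so some *_r names do not match the
 * row indices they hold. */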
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
    v16i8 filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_SB5(src, stride, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
               src98_r, src109_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
               src910_r, src1110_r, src1211_r);
    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

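/* qpel (0, 1/2) for 4x4: vertical half-pel filter with four interleaved
 * 4-wide rows packed per vector. */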
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
               src76_r, src2110, src4332, src6554, src8776);
    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

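/* qpel (1/4, 1/2): vertical half-pel rows are kept at 16 bits and filtered
 * horizontally (mask-driven shuffles plus dot products) to get the centre
 * half-pel 'j', which is averaged with the vertical half-pel of the same
 * column. The left and right 8-column halves run in parallel. */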
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

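/* qpel (3/4, 1/2): as mc12 above, but 'j' is averaged with the vertical
 * half-pel one column to the right (odd lanes selected via pckod). */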
void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src7, src8, src9, src10, src11);

    for (row = 16; row--;) {
        LD_SB2(src, 8, src5, src6);
        src += stride;
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        dst0 = __msa_pckod_h(dst2, dst0);
        dst1 = __msa_pckod_h(dst3, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_UB(out, dst);
        dst += stride;

        src0 = src1;
        src1 = src2;
        src2 = src3;
        src3 = src4;
        src4 = src5;
        src7 = src8;
        src8 = src9;
        src9 = src10;
        src10 = src11;
        src11 = src6;
    }
}

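/* qpel (1/4, 1/2) for 8x8: same scheme as the 16-wide mc12, two rows per
 * iteration. */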
void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

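/* put() for the 8x8 luma block at quarter-pel position (3/4, 1/2):
 * as mc12 above, but the centre result is averaged with the vertical
 * half-pel value of the integer column on its right. */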
void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);
        dst2 = __msa_srari_h(shf_vec8, 5);
        dst3 = __msa_srari_h(shf_vec11, 5);
        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
        dst0 = __msa_pckod_h(dst2, dst0);
        dst1 = __msa_pckod_h(dst3, dst1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
        dst0 = __msa_aver_s_h(dst2, dst0);
        dst1 = __msa_aver_s_h(dst3, dst1);
        out = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

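/* 4x4 variant of the (1/4, 1/2) case above, unrolled (no row loop). */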
void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);
    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

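/* 4x4 variant of the (3/4, 1/2) case above. */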
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);

    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);

    dst0 = __msa_ilvod_h(zeros, dst0);
    dst1 = __msa_ilvod_h(zeros, dst1);
    dst2 = __msa_ilvod_h(zeros, dst2);
    dst3 = __msa_ilvod_h(zeros, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

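/* put() for the 16x16 luma block at the centre half-pel position
 * (1/2, 1/2): horizontal 6-tap filtering followed by vertical 6-tap
 * filtering of the intermediates, processed as two 8-column halves. */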
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint8_t *dst_tmp = dst;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src0, src1, src2, src3);
            XORI_B4_128_SB(src0, src1, src2, src3);
            src += (4 * stride);

            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

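/* 8x8 variant of the centre (1/2, 1/2) case, fully unrolled. */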
void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    src -= ((2 * stride) + 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * stride);
    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

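/* 4x4 variant of the centre (1/2, 1/2) case; the 4-width masks pack
 * two rows into each vector, so each filter call covers a row pair. */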
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    res = PCKEV_XORI128_UB(dst0, dst1);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

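/* avg() for the 16x16 luma block at quarter-pel position (1/4, 0):
 * the horizontal half-pel result is averaged with the integer pixel
 * on its left, then with the existing contents of dst. */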
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

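/* avg() for the 16x16 luma block at quarter-pel position (3/4, 0):
 * as mc10 above, but averaging with the integer pixel on the right. */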
void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
                   src0, src2, src4, src6);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);
        out2 = __msa_aver_s_b(out2, src4);
        out3 = __msa_aver_s_b(out3, src6);
        XORI_B4_128_SB(out0, out1, out2, out3);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

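/* 8x8 variant of the (1/4, 0) case above. */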
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

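/* 8x8 variant of the (3/4, 0) case above. */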
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
               src4, src5, src6, src7);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

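/* 4x4 variant of the (1/4, 0) case. */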
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 };
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

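/* 4x4 variant of the (3/4, 0) case. */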
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 };
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
               src0, src1, src2, src3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}

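/* avg() for the 16x16 luma block at the horizontal half-pel position
 * (1/2, 0): plain 6-tap horizontal filter, averaged with dst. */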
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += stride;
        LD_SB2(src, 8, src2, src3);
        src += stride;
        LD_SB2(src, 8, src4, src5);
        src += stride;
        LD_SB2(src, 8, src6, src7);
        src += stride;

        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
                    vec2, vec3);
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

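/* 8x8 variant of the (1/2, 0) case above. */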
void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out4 = PCKEV_XORI128_UB(res4, res5);
    out5 = PCKEV_XORI128_UB(res6, res7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out2);
    INSERT_D2_UB(tp2, tp3, out3);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out6);
    INSERT_D2_UB(tp2, tp3, out7);
    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
    ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

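/* 4x4 variant of the (1/2, 0) case above. */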
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}

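/* avg() for the 16x16 luma block at quarter-pel position (0, 1/4):
 * the vertical half-pel result is averaged with the integer pixel
 * above, then with the existing contents of dst. */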
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}

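/* avg() for the 16x16 luma block at quarter-pel position (0, 3/4):
 * as mc01 above, but averaging with the integer pixel below. */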
void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}

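/* 8x8 variant of the (0, 1/4) case above, fully unrolled. */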
ff_avg_h264_qpel8_mc01_msa(uint8_t * dst,const uint8_t * src,ptrdiff_t stride)3794 void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
3795 ptrdiff_t stride)
3796 {
3797 uint64_t tp0, tp1, tp2, tp3;
3798 const int16_t filt_const0 = 0xfb01;
3799 const int16_t filt_const1 = 0x1414;
3800 const int16_t filt_const2 = 0x1fb;
3801 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3802 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3803 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3804 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3805 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3806 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3807
3808 filt0 = (v16i8) __msa_fill_h(filt_const0);
3809 filt1 = (v16i8) __msa_fill_h(filt_const1);
3810 filt2 = (v16i8) __msa_fill_h(filt_const2);
3811
3812 src -= (stride * 2);
3813
3814 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3815 src += (5 * stride);
3816
3817 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3818 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3819 src32_r, src43_r);
3820 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3821 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3822 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3823 src87_r, src98_r, src109_r);
3824 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3825 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3826 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3827 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3828 PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
3829 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3830 src21_r, src32_r, src43_r);
3831 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3832 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3833 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3834 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
3835 PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
3836 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3837 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3838 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3839 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3840
3841 LD4(dst, stride, tp0, tp1, tp2, tp3);
3842 INSERT_D2_UB(tp0, tp1, dst0);
3843 INSERT_D2_UB(tp2, tp3, dst1);
3844 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3845 INSERT_D2_UB(tp0, tp1, dst2);
3846 INSERT_D2_UB(tp2, tp3, dst3);
3847
3848 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3849 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
3850 out0 = __msa_aver_s_b(out0, tmp0);
3851 out1 = __msa_aver_s_b(out1, tmp1);
3852 out2 = __msa_aver_s_b(out2, tmp2);
3853 out3 = __msa_aver_s_b(out3, tmp3);
3854 XORI_B4_128_SB(out0, out1, out2, out3);
3855 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3856 dst2, dst3);
3857 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3858 }
3859
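/* avg qpel (0,3): as mc01 above, but the vertical half-pel result is
 * averaged with the lower of the two surrounding integer-pel rows. */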
3860 void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
3861 ptrdiff_t stride)
3862 {
3863 uint64_t tp0, tp1, tp2, tp3;
3864 const int16_t filt_const0 = 0xfb01;
3865 const int16_t filt_const1 = 0x1414;
3866 const int16_t filt_const2 = 0x1fb;
3867 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3868 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3869 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3870 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3871 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3872 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3873
3874 filt0 = (v16i8) __msa_fill_h(filt_const0);
3875 filt1 = (v16i8) __msa_fill_h(filt_const1);
3876 filt2 = (v16i8) __msa_fill_h(filt_const2);
3877
3878 src -= (stride * 2);
3879
3880 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3881 src += (5 * stride);
3882
3883 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3884 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3885 src32_r, src43_r);
3886 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3887 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3888 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3889 src87_r, src98_r, src109_r);
3890 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3891 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3892 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3893 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3894 PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
3895 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3896 src21_r, src32_r, src43_r);
3897 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3898 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3899 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3900 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
3901 PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
3902 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3903 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3904 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3905 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3906
3907 LD4(dst, stride, tp0, tp1, tp2, tp3);
3908 INSERT_D2_UB(tp0, tp1, dst0);
3909 INSERT_D2_UB(tp2, tp3, dst1);
3910 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3911 INSERT_D2_UB(tp0, tp1, dst2);
3912 INSERT_D2_UB(tp2, tp3, dst3);
3913
3914 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3915 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
3916 out0 = __msa_aver_s_b(out0, tmp0);
3917 out1 = __msa_aver_s_b(out1, tmp1);
3918 out2 = __msa_aver_s_b(out2, tmp2);
3919 out3 = __msa_aver_s_b(out3, tmp3);
3920 XORI_B4_128_SB(out0, out1, out2, out3);
3921 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3922 dst2, dst3);
3923 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3924 }
3925
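/* avg qpel (0,1), 4x4 block: row pairs are interleaved two per vector so
 * a single pair of dot-products covers all four output rows; the integer
 * rows used for the quarter-pel average are src2..src5. */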
3926 void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
3927 ptrdiff_t stride)
3928 {
3929 uint32_t tp0, tp1, tp2, tp3;
3930 int16_t filt_const0 = 0xfb01;
3931 int16_t filt_const1 = 0x1414;
3932 int16_t filt_const2 = 0x1fb;
3933 v16u8 res, dst0 = { 0 };
3934 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3935 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3936 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3937 v8i16 out10, out32;
3938
3939 filt0 = (v16i8) __msa_fill_h(filt_const0);
3940 filt1 = (v16i8) __msa_fill_h(filt_const1);
3941 filt2 = (v16i8) __msa_fill_h(filt_const2);
3942
3943 src -= (stride * 2);
3944 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3945 src += (5 * stride);
3946
3947 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3948 src32_r, src43_r);
3949 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3950 XORI_B2_128_SB(src2110, src4332);
3951 LD_SB4(src, stride, src5, src6, src7, src8);
3952 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3953 src76_r, src87_r);
3954 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
3955 XORI_B2_128_SB(src6554, src8776);
3956 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3957 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3958 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3959 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3960 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3961 SRARI_H2_SH(out10, out32, 5);
3962 SAT_SH2_SH(out10, out32, 7);
3963 LW4(dst, stride, tp0, tp1, tp2, tp3);
3964 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3965 res = PCKEV_XORI128_UB(out10, out32);
3966 res = __msa_aver_u_b(res, (v16u8) src32_r);
3967 dst0 = __msa_aver_u_b(res, dst0);
3968 ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3969 }
3970
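/* avg qpel (0,3), 4x4 block: same flow as mc01 above, with the average
 * taken against the lower integer-pel rows (src3..src6). */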
3971 void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
3972 ptrdiff_t stride)
3973 {
3974 uint32_t tp0, tp1, tp2, tp3;
3975 int16_t filt_const0 = 0xfb01;
3976 int16_t filt_const1 = 0x1414;
3977 int16_t filt_const2 = 0x1fb;
3978 v16u8 res, dst0 = { 0 };
3979 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3980 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3981 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3982 v8i16 out10, out32;
3983
3984 filt0 = (v16i8) __msa_fill_h(filt_const0);
3985 filt1 = (v16i8) __msa_fill_h(filt_const1);
3986 filt2 = (v16i8) __msa_fill_h(filt_const2);
3987
3988 src -= (stride * 2);
3989
3990 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3991 src += (5 * stride);
3992
3993 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3994 src32_r, src43_r);
3995 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3996 XORI_B2_128_SB(src2110, src4332);
3997 LD_SB4(src, stride, src5, src6, src7, src8);
3998 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3999 src76_r, src87_r);
4000 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4001 XORI_B2_128_SB(src6554, src8776);
4002 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4003 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4004 SRARI_H2_SH(out10, out32, 5);
4005 SAT_SH2_SH(out10, out32, 7);
4006 LW4(dst, stride, tp0, tp1, tp2, tp3);
4007 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4008 res = PCKEV_XORI128_UB(out10, out32);
4009 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4010 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4011 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
4012 res = __msa_aver_u_b(res, (v16u8) src32_r);
4013 dst0 = __msa_aver_u_b(res, dst0);
4014 ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4015 }
4016
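/* The sixteen mcXY wrappers below (X, Y in {1, 3}) cover the diagonal
 * quarter-pel positions: each forwards the horizontal-filter source
 * (src - 2, advanced one row when Y == 3) and the vertical-filter source
 * (src - 2 * stride, advanced one byte when X == 3) to the shared
 * avc_luma_hv_qrt_and_aver_dst helper for its block size. */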
4017 void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
4018 ptrdiff_t stride)
4019 {
4020 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4021 src - (stride * 2),
4022 dst, stride);
4023 }
4024
4025 void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
4026 ptrdiff_t stride)
4027 {
4028 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4029 src - (stride * 2) +
4030 sizeof(uint8_t),
4031 dst, stride);
4032 }
4033
4034 void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
4035 ptrdiff_t stride)
4036 {
4037 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4038 src - (stride * 2),
4039 dst, stride);
4040 }
4041
4042 void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
4043 ptrdiff_t stride)
4044 {
4045 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4046 src - (stride * 2) +
4047 sizeof(uint8_t),
4048 dst, stride);
4049 }
4050
4051 void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
4052 ptrdiff_t stride)
4053 {
4054 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4055 src - (stride * 2),
4056 dst, stride);
4057 }
4058
4059 void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
4060 ptrdiff_t stride)
4061 {
4062 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4063 src - (stride * 2) +
4064 sizeof(uint8_t), dst, stride);
4065 }
4066
4067 void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
4068 ptrdiff_t stride)
4069 {
4070 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4071 src - (stride * 2),
4072 dst, stride);
4073 }
4074
4075 void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
4076 ptrdiff_t stride)
4077 {
4078 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4079 src - (stride * 2) +
4080 sizeof(uint8_t), dst, stride);
4081 }
4082
4083
4084 void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
4085 ptrdiff_t stride)
4086 {
4087 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4088 src - (stride * 2),
4089 dst, stride);
4090 }
4091
4092 void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
4093 ptrdiff_t stride)
4094 {
4095 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4096 src - (stride * 2) +
4097 sizeof(uint8_t), dst, stride);
4098 }
4099
4100 void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
4101 ptrdiff_t stride)
4102 {
4103 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4104 src - (stride * 2),
4105 dst, stride);
4106 }
4107
4108 void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
4109 ptrdiff_t stride)
4110 {
4111 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4112 src - (stride * 2) +
4113 sizeof(uint8_t), dst, stride);
4114 }
4115
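/* avg qpel (2,1): the 16x16 block is processed as two 8-wide halves. Each
 * half computes the horizontal half-pel rows (hz_out*), runs the vertical
 * 6-tap over them for the centre half-pel (rounded by >>10 inside
 * AVC_DOT_SW3_SW), averages that with the horizontal half-pel row
 * (rounded by >>5), and finally averages with dst. */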
4116 void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
4117 ptrdiff_t stride)
4118 {
4119 uint64_t tp0, tp1, tp2, tp3;
4120 uint8_t *dst_tmp = dst;
4121 const uint8_t *src_tmp = src - (2 * stride) - 2;
4122 uint32_t multiple8_cnt, loop_cnt;
4123 const int32_t filt_const0 = 0xfffb0001;
4124 const int32_t filt_const1 = 0x140014;
4125 const int32_t filt_const2 = 0x1fffb;
4126 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4127 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4128 v16i8 mask2;
4129 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4130 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4131 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4132 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4133 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4134 v8i16 hz_out87_l, filt0, filt1, filt2;
4135 v4i32 tmp0_w, tmp1_w;
4136
4137 filt0 = (v8i16) __msa_fill_w(filt_const0);
4138 filt1 = (v8i16) __msa_fill_w(filt_const1);
4139 filt2 = (v8i16) __msa_fill_w(filt_const2);
4140
4141 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4142
4143 for (multiple8_cnt = 2; multiple8_cnt--;) {
4144 dst = dst_tmp;
4145 src = src_tmp;
4146
4147 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4148 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4149 src += (5 * stride);
4150
4151 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4152 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4153 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4154 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4155 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4156
4157 for (loop_cnt = 4; loop_cnt--;) {
4158 LD_SB2(src, stride, src5, src6);
4159 src += (2 * stride);
4160
4161 XORI_B2_128_SB(src5, src6);
4162 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4163 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4164 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4165 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4166 hz_out43_r);
4167 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4168 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4169 hz_out43_l);
4170 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4171 hz_out65_r);
4172 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4173 hz_out65_l);
4174 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4175 filt1, filt2);
4176 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4177 filt1, filt2);
4178 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4179 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4180 filt1, filt2);
4181 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4182 filt1, filt2);
4183 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4184
4185 tmp1 = __msa_srari_h(hz_out2, 5);
4186 tmp3 = __msa_srari_h(hz_out3, 5);
4187 SAT_SH2_SH(tmp1, tmp3, 7);
4188
4189 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4190 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4191
4192 LD2(dst, stride, tp0, tp1);
4193 INSERT_D2_UB(tp0, tp1, dst0);
4194
4195 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4196 dst0 = __msa_aver_u_b(out0, dst0);
4197 ST_D2(dst0, 0, 1, dst, stride);
4198 dst += (2 * stride);
4199
4200 LD_SB2(src, stride, src7, src8);
4201 src += (2 * stride);
4202
4203 XORI_B2_128_SB(src7, src8);
4204 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4205 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4206 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4207 hz_out87_r);
4208 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4209 hz_out87_l);
4210 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4211 filt1, filt2);
4212 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4213 filt1, filt2);
4214 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4215 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4216 filt1, filt2);
4217 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4218 filt1, filt2);
4219 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4220
4221 tmp5 = __msa_srari_h(hz_out4, 5);
4222 tmp7 = __msa_srari_h(hz_out5, 5);
4223 SAT_SH2_SH(tmp5, tmp7, 7);
4224
4225 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4226 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4227
4228 LD2(dst, stride, tp2, tp3);
4229 INSERT_D2_UB(tp2, tp3, dst1);
4230
4231 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4232 dst1 = __msa_aver_u_b(out1, dst1);
4233 ST_D2(dst1, 0, 1, dst, stride);
4234 dst += (2 * stride);
4235
4236 hz_out0 = hz_out4;
4237 hz_out1 = hz_out5;
4238 hz_out2 = hz_out6;
4239 hz_out3 = hz_out7;
4240 hz_out4 = hz_out8;
4241 }
4242
4243 src_tmp += 8;
4244 dst_tmp += 8;
4245 }
4246 }
4247
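/* avg qpel (2,3): identical flow to mc21 above, except the centre
 * half-pel is averaged with the horizontal half-pel row below it
 * (hz_out3/hz_out4 rather than hz_out2/hz_out3). */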
4248 void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
4249 ptrdiff_t stride)
4250 {
4251 uint64_t tp0, tp1, tp2, tp3;
4252 uint8_t *dst_tmp = dst;
4253 const uint8_t *src_tmp = src - (2 * stride) - 2;
4254 uint32_t multiple8_cnt, loop_cnt;
4255 const int32_t filt_const0 = 0xfffb0001;
4256 const int32_t filt_const1 = 0x140014;
4257 const int32_t filt_const2 = 0x1fffb;
4258 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4259 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4260 v16i8 mask2;
4261 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4262 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4263 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4264 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4265 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4266 v8i16 hz_out87_l, filt0, filt1, filt2;
4267 v4i32 tmp0_w, tmp1_w;
4268
4269 filt0 = (v8i16) __msa_fill_w(filt_const0);
4270 filt1 = (v8i16) __msa_fill_w(filt_const1);
4271 filt2 = (v8i16) __msa_fill_w(filt_const2);
4272
4273 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4274
4275 for (multiple8_cnt = 2; multiple8_cnt--;) {
4276 dst = dst_tmp;
4277 src = src_tmp;
4278
4279 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4280 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4281 src += (5 * stride);
4282
4283 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4284 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4285 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4286 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4287 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4288
4289 for (loop_cnt = 4; loop_cnt--;) {
4290 LD_SB2(src, stride, src5, src6);
4291 src += (2 * stride);
4292
4293 XORI_B2_128_SB(src5, src6);
4294 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4295 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4296 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4297 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4298 hz_out43_r);
4299 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4300 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4301 hz_out43_l);
4302 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4303 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4304
4305 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4306 filt1, filt2);
4307 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4308 filt1, filt2);
4309 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4310 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4311 filt1, filt2);
4312 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4313 filt1, filt2);
4314 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4315
4316 tmp1 = __msa_srari_h(hz_out3, 5);
4317 tmp3 = __msa_srari_h(hz_out4, 5);
4318 SAT_SH2_SH(tmp1, tmp3, 7);
4319
4320 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4321 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4322
4323 LD2(dst, stride, tp0, tp1);
4324 INSERT_D2_UB(tp0, tp1, dst0);
4325 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4326 dst0 = __msa_aver_u_b(out0, dst0);
4327 ST_D2(dst0, 0, 1, dst, stride);
4328 dst += (2 * stride);
4329
4330 LD_SB2(src, stride, src7, src8);
4331 src += (2 * stride);
4332
4333 XORI_B2_128_SB(src7, src8);
4334 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4335 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4336 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4337 hz_out87_r);
4338 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4339 hz_out87_l);
4340 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4341 filt1, filt2);
4342 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4343 filt1, filt2);
4344 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4345 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4346 filt1, filt2);
4347 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4348 filt1, filt2);
4349 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4350
4351 tmp5 = __msa_srari_h(hz_out5, 5);
4352 tmp7 = __msa_srari_h(hz_out6, 5);
4353 SAT_SH2_SH(tmp5, tmp7, 7);
4354
4355 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4356 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4357
4358 LD2(dst, stride, tp2, tp3);
4359 INSERT_D2_UB(tp2, tp3, dst1);
4360 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4361 dst1 = __msa_aver_u_b(out1, dst1);
4362 ST_D2(dst1, 0, 1, dst, stride);
4363 dst += (2 * stride);
4364
4365 hz_out0 = hz_out4;
4366 hz_out1 = hz_out5;
4367 hz_out2 = hz_out6;
4368 hz_out3 = hz_out7;
4369 hz_out4 = hz_out8;
4370 }
4371
4372 src_tmp += 8;
4373 dst_tmp += 8;
4374 }
4375 }
4376
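/* avg qpel (2,1), 8x8 block: fully unrolled version of the 16x16 mc21
 * flow above, written as two groups of four output rows. */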
4377 void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
4378 ptrdiff_t stride)
4379 {
4380 const int32_t filt_const0 = 0xfffb0001;
4381 const int32_t filt_const1 = 0x140014;
4382 const int32_t filt_const2 = 0x1fffb;
4383 uint64_t tp0, tp1, tp2, tp3;
4384 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4385 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4386 v16i8 src11, src12, mask0, mask1, mask2;
4387 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4388 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4389 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4390 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4391 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4392 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4393 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4394 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4395 v4i32 tmp0_w, tmp1_w;
4396
4397 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4398
4399 filt0 = (v8i16) __msa_fill_w(filt_const0);
4400 filt1 = (v8i16) __msa_fill_w(filt_const1);
4401 filt2 = (v8i16) __msa_fill_w(filt_const2);
4402
4403 src -= ((2 * stride) + 2);
4404
4405 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4406 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4407 src += (5 * stride);
4408
4409 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4410 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4411 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4412 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4413 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4414
4415 LD_SB4(src, stride, src5, src6, src7, src8);
4416 src += (4 * stride);
4417 XORI_B4_128_SB(src5, src6, src7, src8);
4418
4419 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4420 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4421 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4422 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4423
4424 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4425 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4426 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4428 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4429 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4430 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4432
4433 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4434 filt2);
4435 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4436 filt2);
4437 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4438 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4439 filt2);
4440 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4441 filt2);
4442 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4443 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4444 filt2);
4445 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4446 filt2);
4447 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4448 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4449 filt2);
4450 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4451 filt2);
4452 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4453
4454 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4455 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4456
4457 LD4(dst, stride, tp0, tp1, tp2, tp3);
4458 INSERT_D2_UB(tp0, tp1, dst0);
4459 INSERT_D2_UB(tp2, tp3, dst1);
4460
4461 tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4462 tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4463 tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4464 tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4465
4466 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4467 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4468 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4469 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4470 dst += (4 * stride);
4471
4472 LD_SB4(src, stride, src9, src10, src11, src12);
4473 XORI_B4_128_SB(src9, src10, src11, src12);
4474 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4475 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4476 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4477 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4478 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4479 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4480 hz_out1211_r);
4481 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4482 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4483 hz_out1211_l);
4484 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4485 filt2);
4486 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4487 filt2);
4488 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4489 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4490 filt2);
4491 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4492 filt2);
4493 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4494 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4495 filt2);
4496 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4497 filt2);
4498 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4499 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4500 filt2);
4501 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4502 filt2);
4503 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4504
4505 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4506 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4507
4508 LD4(dst, stride, tp0, tp1, tp2, tp3);
4509 INSERT_D2_UB(tp0, tp1, dst0);
4510 INSERT_D2_UB(tp2, tp3, dst1);
4511
4512 tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4513 tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4514 tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4515 tmp3 = __msa_aver_s_h(tmp3, hz_out9);
4516
4517 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4518 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4519 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4520 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4521 }
4522
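/* avg qpel (2,3), 8x8 block: as the 8x8 mc21 above, averaging the centre
 * half-pel against the next horizontal half-pel rows (hz_out3..hz_out6,
 * then hz_out7..hz_out10). */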
4523 void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
4524 ptrdiff_t stride)
4525 {
4526 const int32_t filt_const0 = 0xfffb0001;
4527 const int32_t filt_const1 = 0x140014;
4528 const int32_t filt_const2 = 0x1fffb;
4529 uint64_t tp0, tp1, tp2, tp3;
4530 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4531 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4532 v16i8 src11, src12, mask0, mask1, mask2;
4533 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4534 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4535 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4536 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4537 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4538 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4539 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4540 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4541 v4i32 tmp0_w, tmp1_w;
4542
4543 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4544
4545 filt0 = (v8i16) __msa_fill_w(filt_const0);
4546 filt1 = (v8i16) __msa_fill_w(filt_const1);
4547 filt2 = (v8i16) __msa_fill_w(filt_const2);
4548
4549 src -= ((2 * stride) + 2);
4550
4551 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4552 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4553 src += (5 * stride);
4554
4555 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4556 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4557 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4558 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4559 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4560
4561 LD_SB4(src, stride, src5, src6, src7, src8);
4562 src += (4 * stride);
4563 XORI_B4_128_SB(src5, src6, src7, src8);
4564
4565 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4566 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4567 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4568 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4569
4570 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4571 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4572 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4574 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4575 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4576 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4578
4579 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4580 filt2);
4581 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4582 filt2);
4583 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4584 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4585 filt2);
4586 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4587 filt2);
4588 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4589 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4590 filt2);
4591 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4592 filt2);
4593 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4594 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4595 filt2);
4596 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4597 filt2);
4598 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4599
4600 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4601 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
4602
4603 LD4(dst, stride, tp0, tp1, tp2, tp3);
4604 INSERT_D2_UB(tp0, tp1, dst0);
4605 INSERT_D2_UB(tp2, tp3, dst1);
4606
4607 tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4608 tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4609 tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4610 tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4611
4612 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4613 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4614 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4615 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4616 dst += (4 * stride);
4617
4618 LD_SB4(src, stride, src9, src10, src11, src12);
4619 XORI_B4_128_SB(src9, src10, src11, src12);
4620 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4621 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4622 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4623 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4624 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4625 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4626 hz_out1211_r);
4627 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4628 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4629 hz_out1211_l);
4630 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4631 filt2);
4632 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4633 filt2);
4634 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4635 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4636 filt2);
4637 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4638 filt2);
4639 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4640 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4641 filt2);
4642 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4643 filt2);
4644 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4645 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4646 filt2);
4647 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4648 filt2);
4649 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4650
4651 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4652 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4653
4654 LD4(dst, stride, tp0, tp1, tp2, tp3);
4655 INSERT_D2_UB(tp0, tp1, dst0);
4656 INSERT_D2_UB(tp2, tp3, dst1);
4657
4658 tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4659 tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4660 tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4661 tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4662
4663 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4664 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4665 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4666 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4667 }
4668
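/* avg qpel (2,1), 4x4 block: the 4-width masks at luma_mask_arr[48] let
 * one AVC_HORZ_FILTER_SH call filter two source rows at once; PCKOD_D2_SH
 * then splits the packed pairs back into per-row vectors. */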
4669 void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
4670 ptrdiff_t stride)
4671 {
4672 uint32_t tp0, tp1, tp2, tp3;
4673 const int32_t filt_const0 = 0xfffb0001;
4674 const int32_t filt_const1 = 0x140014;
4675 const int32_t filt_const2 = 0x1fffb;
4676 v16u8 res, out = { 0 };
4677 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4678 v16i8 mask0, mask1, mask2;
4679 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4680 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4681 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4682 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4683 v4i32 tmp0, tmp1;
4684
4685 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4686
4687 filt0 = (v8i16) __msa_fill_w(filt_const0);
4688 filt1 = (v8i16) __msa_fill_w(filt_const1);
4689 filt2 = (v8i16) __msa_fill_w(filt_const2);
4690
4691 src -= ((2 * stride) + 2);
4692
4693 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4694 src += (5 * stride);
4695 LD_SB4(src, stride, src5, src6, src7, src8);
4696
4697 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4698 XORI_B4_128_SB(src5, src6, src7, src8);
4699
4700 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4701 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4702 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4703 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4704 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4705 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4706 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4707
4708 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4709 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4710 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4711 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4712
4713 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4714 filt2);
4715 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4716 filt2);
4717 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4718 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4719 filt2);
4720 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4721 filt2);
4722 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4723
4724 SRARI_H2_SH(hz_out2, hz_out4, 5);
4725 SAT_SH2_SH(hz_out2, hz_out4, 7);
4726
4727 dst0 = __msa_aver_s_h(dst0, hz_out2);
4728 dst1 = __msa_aver_s_h(dst1, hz_out4);
4729 LW4(dst, stride, tp0, tp1, tp2, tp3);
4730 INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4731 res = PCKEV_XORI128_UB(dst0, dst1);
4732 res = __msa_aver_u_b(res, out);
4733 ST_W4(res, 0, 1, 2, 3, dst, stride);
4734 }
4735
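/* avg qpel (2,3), 4x4 block: same as the 4x4 mc21 above, with the centre
 * half-pel averaged against the horizontal half-pel rows one line down
 * (hz_out3..hz_out6, repacked via PCKEV_D2_SH). */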
4736 void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
4737 ptrdiff_t stride)
4738 {
4739 const int32_t filt_const0 = 0xfffb0001;
4740 const int32_t filt_const1 = 0x140014;
4741 const int32_t filt_const2 = 0x1fffb;
4742 uint32_t tp0, tp1, tp2, tp3;
4743 v16u8 res, out = { 0 };
4744 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4745 v16i8 mask0, mask1, mask2;
4746 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4747 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4748 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4749 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4750 v4i32 tmp0, tmp1;
4751
4752 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4753
4754 filt0 = (v8i16) __msa_fill_w(filt_const0);
4755 filt1 = (v8i16) __msa_fill_w(filt_const1);
4756 filt2 = (v8i16) __msa_fill_w(filt_const2);
4757
4758 src -= ((2 * stride) + 2);
4759
4760 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4761 src += (5 * stride);
4762 LD_SB4(src, stride, src5, src6, src7, src8);
4763
4764 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4765 XORI_B4_128_SB(src5, src6, src7, src8);
4766
4767 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4768 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4769 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4770 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4771 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4772 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4773 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4774
4775 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4776 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4777 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4778 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4779
4780 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4781 filt2);
4782 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4783 filt2);
4784 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4785 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4786 filt2);
4787 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4788 filt2);
4789 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4790
4791 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4792 SRARI_H2_SH(hz_out0, hz_out1, 5);
4793 SAT_SH2_SH(hz_out0, hz_out1, 7);
4794
4795 dst0 = __msa_aver_s_h(dst0, hz_out0);
4796 dst1 = __msa_aver_s_h(dst1, hz_out1);
4797 LW4(dst, stride, tp0, tp1, tp2, tp3);
4798 INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4799 res = PCKEV_XORI128_UB(dst0, dst1);
4800 res = __msa_aver_u_b(res, out);
4801 ST_W4(res, 0, 1, 2, 3, dst, stride);
4802 }
4803
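/* avg qpel (0,2): plain vertical half-pel for a 16x16 block. Four rows
 * are filtered per iteration; results are rounded (>>5), saturated,
 * packed to bytes and averaged with the existing dst rows. */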
4804 void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
4805 ptrdiff_t stride)
4806 {
4807 int32_t loop_cnt;
4808 int16_t filt_const0 = 0xfb01;
4809 int16_t filt_const1 = 0x1414;
4810 int16_t filt_const2 = 0x1fb;
4811 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4812 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4813 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4814 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4815 v16i8 src65_l, src87_l, filt0, filt1, filt2;
4816 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4817
4818 filt0 = (v16i8) __msa_fill_h(filt_const0);
4819 filt1 = (v16i8) __msa_fill_h(filt_const1);
4820 filt2 = (v16i8) __msa_fill_h(filt_const2);
4821 src -= (stride * 2);
4822
4823 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4824 src += (5 * stride);
4825
4826 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4827 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4828 src32_r, src43_r);
4829 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
4830 src32_l, src43_l);
4831
4832 for (loop_cnt = 4; loop_cnt--;) {
4833 LD_SB4(src, stride, src5, src6, src7, src8);
4834 src += (4 * stride);
4835
4836 XORI_B4_128_SB(src5, src6, src7, src8);
4837 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4838 src65_r, src76_r, src87_r);
4839 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4840 src65_l, src76_l, src87_l);
4841 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4842 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4843 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4844 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4845 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4846 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4847 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4848 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
4849 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4850 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4851 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
4852 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
4853 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4854 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4855 out3_r, res0, res1, res2, res3);
4856 XORI_B4_128_UB(res0, res1, res2, res3);
4857 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
4858 AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
4859 ST_UB4(res0, res1, res2, res3, dst, stride);
4860 dst += (4 * stride);
4861
4862 src10_r = src54_r;
4863 src32_r = src76_r;
4864 src21_r = src65_r;
4865 src43_r = src87_r;
4866 src10_l = src54_l;
4867 src32_l = src76_l;
4868 src21_l = src65_l;
4869 src43_l = src87_l;
4870 src4 = src8;
4871 }
4872 }
4873
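/* avg qpel (0,2), 8x8 block: unrolled vertical half-pel, producing all
 * eight output rows in a single pass. */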
4874 void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
4875 ptrdiff_t stride)
4876 {
4877 uint64_t tp0, tp1, tp2, tp3;
4878 const int16_t filt_const0 = 0xfb01;
4879 const int16_t filt_const1 = 0x1414;
4880 const int16_t filt_const2 = 0x1fb;
4881 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4882 v16u8 out0, out1, out2, out3;
4883 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4884 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4885 v16i8 filt0, filt1, filt2;
4886 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4887
4888 filt0 = (v16i8) __msa_fill_h(filt_const0);
4889 filt1 = (v16i8) __msa_fill_h(filt_const1);
4890 filt2 = (v16i8) __msa_fill_h(filt_const2);
4891
4892 src -= (stride * 2);
4893
4894 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4895 src += (5 * stride);
4896
4897 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4898 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4899 src32_r, src43_r);
4900
4901 LD_SB4(src, stride, src7, src8, src9, src10);
4902 src += (4 * stride);
4903 XORI_B4_128_SB(src7, src8, src9, src10);
4904 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4905 src87_r, src98_r, src109_r);
4906 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4907 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4908 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4909 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
4910
4911 LD_SB4(src, stride, src0, src1, src2, src3);
4912 XORI_B4_128_SB(src0, src1, src2, src3);
4913 ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4914 src21_r, src32_r, src43_r);
4915 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4916 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4917 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4918 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
4919
4920 LD4(dst, stride, tp0, tp1, tp2, tp3);
4921 INSERT_D2_UB(tp0, tp1, dst0);
4922 INSERT_D2_UB(tp2, tp3, dst1);
4923 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4924 INSERT_D2_UB(tp0, tp1, dst2);
4925 INSERT_D2_UB(tp2, tp3, dst3);
4926
4927 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4928 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
4929 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4930 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
4931 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
4932 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
4933 out2 = PCKEV_XORI128_UB(out4_r, out5_r);
4934 out3 = PCKEV_XORI128_UB(out6_r, out7_r);
4935 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4936 dst2, dst3);
4937 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
4938 }
4939
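/* avg qpel (0,2), 4x4 block: interleaved row pairs are packed two per
 * vector so one pair of dot-products yields all four output rows. */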
4940 void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
4941 ptrdiff_t stride)
4942 {
4943 uint32_t tp0, tp1, tp2, tp3;
4944 int16_t filt_const0 = 0xfb01;
4945 int16_t filt_const1 = 0x1414;
4946 int16_t filt_const2 = 0x1fb;
4947 v16u8 res, dst0 = { 0 };
4948 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4949 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4950 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4951 v8i16 out10, out32;
4952
4953 filt0 = (v16i8) __msa_fill_h(filt_const0);
4954 filt1 = (v16i8) __msa_fill_h(filt_const1);
4955 filt2 = (v16i8) __msa_fill_h(filt_const2);
4956
4957 src -= (stride * 2);
4958 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4959 src += (5 * stride);
4960
4961 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4962 src32_r, src43_r);
4963 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
4964 XORI_B2_128_SB(src2110, src4332);
4965 LD_SB4(src, stride, src5, src6, src7, src8);
4966 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4967 src76_r, src87_r);
4968 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4969 XORI_B2_128_SB(src6554, src8776);
4970 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4971 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4972 SRARI_H2_SH(out10, out32, 5);
4973 SAT_SH2_SH(out10, out32, 7);
4974 LW4(dst, stride, tp0, tp1, tp2, tp3);
4975 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4976 res = PCKEV_XORI128_UB(out10, out32);
4977 dst0 = __msa_aver_u_b(res, dst0);
4978 ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4979 }
4980
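/* avg qpel (1,2): vertical 6-tap first (AVC_CALC_DPADD_B_6PIX_2COEFF_SH),
 * then the horizontal 6-tap over that intermediate via halfword shuffles
 * to get the centre half-pel (>>10). The even lanes of shf_vec2/5/8/11,
 * selected with pckev, hold the vertical half-pel at the left integer
 * column; that is averaged in before the final average with dst. */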
4981 void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
4982 ptrdiff_t stride)
4983 {
4984 uint32_t row;
4985 v16u8 out, dst0;
4986 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4987 v16i8 src11;
4988 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4989 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4990 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4991 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
4992 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4993 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4994 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
4995 v8i16 minus5h = __msa_ldi_h(-5);
4996 v8i16 plus20h = __msa_ldi_h(20);
4997
4998 mask3 = mask0 + 4;
4999 mask4 = mask1 + 4;
5000 mask5 = mask2 + 4;
5001
5002 src -= ((2 * stride) + 2);
5003
5004 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5005 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5006 src += (5 * stride);
5007 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5008 XORI_B5_128_SB(src7, src8, src9, src10, src11);
5009
5010 for (row = 16; row--;) {
5011 LD_SB2(src, 8, src5, src6);
5012 src += stride;
5013 XORI_B2_128_SB(src5, src6);
5014 dst0 = LD_UB(dst);
5015
5016 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5017 vt_res0, vt_res1);
5018 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5019 vt_res2, vt_res3);
5020 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5021 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5022 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5023 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5024 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5025 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5026 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5027 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
5028 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5029 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5030 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5031 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5032 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5033 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5034 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5035 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5036 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5037 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5038 tmp0 = __msa_srari_h(shf_vec2, 5);
5039 tmp1 = __msa_srari_h(shf_vec5, 5);
5040 tmp2 = __msa_srari_h(shf_vec8, 5);
5041 tmp3 = __msa_srari_h(shf_vec11, 5);
5042 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
5043 PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
5044 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5045 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5046 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5047 out = PCKEV_XORI128_UB(tmp0, tmp1);
5048 out = __msa_aver_u_b(out, dst0);
5049 ST_UB(out, dst);
5050 dst += stride;
5051
5052 src0 = src1;
5053 src1 = src2;
5054 src2 = src3;
5055 src3 = src4;
5056 src4 = src5;
5057 src7 = src8;
5058 src8 = src9;
5059 src9 = src10;
5060 src10 = src11;
5061 src11 = src6;
5062 }
5063 }
5064
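/* avg qpel (3,2): same as mc12 above, but pckod selects the odd lanes,
 * i.e. the vertical half-pel at the right-hand integer column. */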
5065 void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
5066 ptrdiff_t stride)
5067 {
5068 uint32_t row;
5069 v16u8 out, dst0;
5070 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5071 v16i8 src11;
5072 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5073 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5074 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5075 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
5076 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5077 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5078 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5079 v8i16 minus5h = __msa_ldi_h(-5);
5080 v8i16 plus20h = __msa_ldi_h(20);
5081
5082 mask3 = mask0 + 4;
5083 mask4 = mask1 + 4;
5084 mask5 = mask2 + 4;
5085
5086 src -= ((2 * stride) + 2);
5087
5088 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5089 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5090 src += (5 * stride);
5091 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5092 XORI_B5_128_SB(src7, src8, src9, src10, src11);
5093
5094 for (row = 16; row--;) {
5095 LD_SB2(src, 8, src5, src6);
5096 src += stride;
5097 XORI_B2_128_SB(src5, src6);
5098 dst0 = LD_UB(dst);
5099
5100 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5101 vt_res0, vt_res1);
5102 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5103 vt_res2, vt_res3);
5104 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5105 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5106 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5107 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5108 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5109 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5110 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5111 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
5112 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5113 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5114 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5115 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5116 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5117 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5118 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5119 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5120 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5121 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5122 tmp0 = __msa_srari_h(shf_vec2, 5);
5123 tmp1 = __msa_srari_h(shf_vec5, 5);
5124 tmp2 = __msa_srari_h(shf_vec8, 5);
5125 tmp3 = __msa_srari_h(shf_vec11, 5);
5126 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
5127 tmp0 = __msa_pckod_h(tmp2, tmp0);
5128 tmp1 = __msa_pckod_h(tmp3, tmp1);
5129 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5130 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5131 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5132 out = PCKEV_XORI128_UB(tmp0, tmp1);
5133 out = __msa_aver_u_b(out, dst0);
5134 ST_UB(out, dst);
5135 dst += stride;
5136
5137 src0 = src1;
5138 src1 = src2;
5139 src2 = src3;
5140 src3 = src4;
5141 src4 = src5;
5142 src7 = src8;
5143 src8 = src9;
5144 src9 = src10;
5145 src10 = src11;
5146 src11 = src6;
5147 }
5148 }
5149
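/* avg qpel (1,2), 8x8 block: two output rows per iteration; both vertical
 * 6-tap results come from the same sliding window of source rows
 * (src0..src5 and src1..src6). */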
5150 void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
5151 ptrdiff_t stride)
5152 {
5153 uint32_t row;
5154 uint64_t tp0, tp1;
5155 v16u8 out, dst0 = { 0 };
5156 v16i8 src0, src1, src2, src3, src4, src5, src6;
5157 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5158 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5159 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5160 v8i16 mask3, mask4, mask5;
5161 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
5162 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5163 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5164 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5165 v8i16 minus5h = __msa_ldi_h(-5);
5166 v8i16 plus20h = __msa_ldi_h(20);
5167
5168 mask3 = mask0 + 4;
5169 mask4 = mask1 + 4;
5170 mask5 = mask2 + 4;
5171
5172 src -= ((2 * stride) + 2);
5173
5174 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5175 src += (5 * stride);
5176 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5177
5178 for (row = 4; row--;) {
5179 LD_SB2(src, stride, src5, src6);
5180 src += (2 * stride);
5181 XORI_B2_128_SB(src5, src6);
5182
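        /* Two overlapping six-row windows yield the vertical 6-tap
         * intermediates for the two rows emitted per iteration. */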
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        LD2(dst, stride, tp0, tp1);
        INSERT_D2_UB(tp0, tp1, dst0);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

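/* avg qpel8 mc32: luma quarter-pel at (3/4, 1/2). Same two-path scheme as
 * mc12, but the vertical half-pel column one pixel to the right is taken
 * (odd packed lanes instead of even ones) before the averaging. */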
void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t row;
    uint64_t tp0, tp1;
    v16u8 out, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
    v8i16 mask3, mask4, mask5;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    mask3 = mask0 + 4;
    mask4 = mask1 + 4;
    mask5 = mask2 + 4;

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = 4; row--;) {
        LD_SB2(src, stride, src5, src6);
        src += (2 * stride);
        XORI_B2_128_SB(src5, src6);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
        tmp0 = __msa_srari_h(shf_vec2, 5);
        tmp1 = __msa_srari_h(shf_vec5, 5);
        tmp2 = __msa_srari_h(shf_vec8, 5);
        tmp3 = __msa_srari_h(shf_vec11, 5);
        LD2(dst, stride, tp0, tp1);
        INSERT_D2_UB(tp0, tp1, dst0);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        tmp0 = __msa_pckod_h(tmp2, tmp0);
        tmp1 = __msa_pckod_h(tmp3, tmp1);
        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
        tmp0 = __msa_aver_s_h(tmp2, tmp0);
        tmp1 = __msa_aver_s_h(tmp3, tmp1);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        out = __msa_aver_u_b(out, dst0);
        ST_D2(out, 0, 1, dst, stride);
        dst += (2 * stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}

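/* avg qpel4 mc12: 4x4 variant of the (1/4, 1/2) interpolation. The block is
 * small enough that all vertical 6-tap intermediates are computed up front
 * rather than inside a row loop. */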
void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out, dstv = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

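    /* Vertical-only half-pel path: (val + 16) >> 5 with clipping; the
     * samples are widened to 32-bit lanes so both half-pel paths can be
     * averaged with signed word averages before the final pack. */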
    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);
    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    out = __msa_aver_u_b(out, dstv);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

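/* avg qpel4 mc32: 4x4 variant of the (3/4, 1/2) interpolation; it differs
 * from mc12 only in selecting the odd lanes (right-hand column) of the
 * vertical half-pel intermediates. */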
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out, dstv = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
    v16i8 src76_l, src87_l, filt0, filt1, filt2;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);

    SRARI_W2_SW(hz_res0, hz_res1, 10);
    SAT_SW2_SW(hz_res0, hz_res1, 7);
    SRARI_W2_SW(hz_res2, hz_res3, 10);
    SAT_SW2_SW(hz_res2, hz_res3, 7);

    dst0 = __msa_srari_h(shf_vec2, 5);
    dst1 = __msa_srari_h(shf_vec5, 5);
    dst2 = __msa_srari_h(shf_vec6, 5);
    dst3 = __msa_srari_h(shf_vec7, 5);

    SAT_SH2_SH(dst0, dst1, 7);
    SAT_SH2_SH(dst2, dst3, 7);

    dst0 = __msa_ilvod_h(zeros, dst0);
    dst1 = __msa_ilvod_h(zeros, dst1);
    dst2 = __msa_ilvod_h(zeros, dst2);
    dst3 = __msa_ilvod_h(zeros, dst3);

    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);

    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
    out = PCKEV_XORI128_UB(dst0, dst2);
    out = __msa_aver_u_b(out, dstv);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

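/* avg qpel16 mc22: centre (1/2, 1/2) half-pel. Horizontal 6-tap filtering
 * comes first, then a vertical 6-tap pass over the 16-bit intermediates
 * with (val + 512) >> 10 rounding; each result row is averaged with the
 * bytes already in dst. */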
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint8_t *dst_tmp = dst;
    uint64_t tp0, tp1, tp2, tp3;
    uint32_t multiple8_cnt, loop_cnt;
    v16u8 dst0, dst1, out0, out1;
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

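    /* The 16-wide block is handled as two 8-wide column passes;
     * src_tmp/dst_tmp advance by 8 pixels between passes. */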
    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src0, src1, src2, src3);
            XORI_B4_128_SB(src0, src1, src2, src3);
            src += (4 * stride);

            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            LD4(dst, stride, tp0, tp1, tp2, tp3);
            INSERT_D2_UB(tp0, tp1, dst0);
            INSERT_D2_UB(tp2, tp3, dst1);
            out0 = PCKEV_XORI128_UB(res0, res1);
            out1 = PCKEV_XORI128_UB(res2, res3);
            AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

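/* avg qpel8 mc22: 8x8 centre half-pel. The eight output rows are produced
 * in two unrolled groups of four, with hz_out5..hz_out8 reused as the top
 * filter context of the second group. */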
void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    src -= ((2 * stride) + 2);
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * stride);
    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    dst += (4 * stride);

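    /* Second group of four rows: hz_out9..hz_out12 extend the window while
     * hz_out54..hz_out87 carry over as the upper filter context. */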
    LD_SB4(src, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

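/* avg qpel4 mc22: 4x4 centre half-pel. The 4-width masks (second half of
 * luma_mask_arr) pack two input rows per vector, so one horizontal pass
 * per vector pair covers the block before the vertical 6-tap filtering
 * and the average with dst. */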
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(res0, res1);
    res = __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, stride);
}
