1 /*
2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "qpeldsp_mips.h"
23
/*
 * 16-column horizontal quarter-pel FIR for one row.
 * inp0 = src[0..15], inp1 = src[1..16] (callers load inp1 from src + 1);
 * mask is the byte-reversal shuffle that builds mirrored copies
 * tmp0/tmp1, from which the outer-tap neighbour vectors data0..data5
 * are sliced with sldi — presumably to mirror pixels at the block
 * edges; TODO(review) confirm against the scalar qpel reference.
 * Accumulates  coef0*sum0 + coef2*sum2  minus  sum3 + coef1*sum1
 * over the pairwise tap sums, rounds with SRARI by 5 (i.e.
 * (x + 16) >> 5) and clips to [0, 255].  With the callers'
 * (coef0, coef1, coef2) = (20, 6, 3) this realises the 8-tap
 * (-1, 3, -6, 20, 20, -6, 3, -1)/32 filter.
 * Yields the 16 filtered pixels as a v16u8.
 */
#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
( {                                                                     \
    v16u8 out, tmp0, tmp1;                                              \
    v16u8 data0, data1, data2, data3, data4, data5;                     \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
    sum0_r *= (v8u16) (coef0);                                          \
    sum0_l *= (v8u16) (coef0);                                          \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    SRARI_H2_SH(res_r, res_l, 5);                                       \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    out;                                                                \
} )
57
/*
 * 8-column horizontal quarter-pel FIR for TWO rows at once
 * (inp0 = row 0, inp1 = row 1; each holds 9+ source bytes).
 * mask0..mask3 are VSHF patterns that gather the tap-pair bytes
 * directly, so no explicit sliding is needed.
 * Positive accumulator: coef0*sum0 + coef2*sum2; negative:
 * sum3 + coef1*sum1; result rounded with SRARI by 5
 * ((x + 16) >> 5) and clipped to [0, 255].
 * Returns both filtered 8-byte rows packed low/high in one v16u8.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                         \
                                      mask0, mask1, mask2, mask3,         \
                                      coef0, coef1, coef2)                \
( {                                                                       \
    v16u8 out;                                                            \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                 \
    v8i16 res0_r, res1_r;                                                 \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);     \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);     \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                          \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);            \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);     \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);     \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);           \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
    res1_r = (v8i16) (sum4_r - sum7_r);                                   \
    SRARI_H2_SH(res0_r, res1_r, 5);                                       \
    CLIP_SH2_0_255(res0_r, res1_r);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);          \
                                                                          \
    out;                                                                  \
} )
83
/*
 * 8-column horizontal quarter-pel FIR for a SINGLE row (inp0).
 * Same tap gathering as APPLY_HORIZ_QPEL_FILTER_8BYTE but with one
 * input vector; positive = coef0*sum0 + coef2*sum2, negative =
 * sum3 + coef1*sum1, rounded with srari by 5 ((x + 16) >> 5)
 * and clipped to [0, 255].  The 8 result bytes are duplicated into
 * both halves of the returned v16u8 by the final pckev.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                          \
                                           mask0, mask1, mask2, mask3,    \
                                           coef0, coef1, coef2)           \
( {                                                                       \
    v16u8 out;                                                            \
    v8i16 res0_r;                                                         \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
    res0_r = __msa_srari_h(res0_r, 5);                                    \
    CLIP_SH_0_255(res0_r);                                                \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
                                                                          \
    out;                                                                  \
} )
104
/*
 * "No-rounding" variant of APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW:
 * identical tap accumulation, but the final scaling uses
 * (x + 15) >> 5 instead of the rounded (x + 16) >> 5, matching the
 * no_rnd motion-compensation mode.  Result clipped to [0, 255] and
 * duplicated into both halves of the returned v16u8.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
                                                    mask2, mask3, coef0,  \
                                                    coef1, coef2)         \
( {                                                                       \
    v16u8 out;                                                            \
    v8i16 res0_r;                                                         \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
    res0_r += 15;                                                         \
    res0_r >>= 5;                                                         \
    CLIP_SH_0_255(res0_r);                                                \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
                                                                          \
    out;                                                                  \
} )
126
/*
 * "No-rounding" variant of APPLY_HORIZ_QPEL_FILTER (16 columns, one
 * row): identical tap extraction and accumulation, but scales with
 * (x + 15) >> 5 instead of the rounded SRARI ((x + 16) >> 5),
 * matching the no_rnd motion-compensation mode.  Clips to [0, 255]
 * and returns the 16 filtered pixels as a v16u8.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
                                         coef0, coef1, coef2)           \
( {                                                                     \
    v16u8 out, tmp0, tmp1;                                              \
    v16u8 data0, data1, data2, data3, data4, data5;                     \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
    sum0_r *= (v8u16) (coef0);                                          \
    sum0_l *= (v8u16) (coef0);                                          \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    res_r += 15;                                                        \
    res_l += 15;                                                        \
    res_r >>= 5;                                                        \
    res_l >>= 5;                                                        \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    out;                                                                \
} )
164
/*
 * "No-rounding" variant of APPLY_HORIZ_QPEL_FILTER_8BYTE (8 columns,
 * two rows at once): identical tap accumulation, final scaling by
 * (x + 15) >> 5 instead of the rounded (x + 16) >> 5.  Clips to
 * [0, 255] and packs both filtered rows into one v16u8.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                \
                                               mask0, mask1, mask2, mask3,\
                                               coef0, coef1, coef2)       \
( {                                                                       \
    v16u8 out;                                                            \
    v8i16 res0_r, res1_r;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                 \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);     \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);     \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                          \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);            \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);     \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);     \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);           \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
    res1_r = (v8i16) (sum4_r - sum7_r);                                   \
    res0_r += 15;                                                         \
    res1_r += 15;                                                         \
    res0_r >>= 5;                                                         \
    res1_r >>= 5;                                                         \
    CLIP_SH2_0_255(res0_r, res1_r);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);          \
                                                                          \
    out;                                                                  \
} )
193
/*
 * 16-column vertical quarter-pel FIR combining eight input rows
 * inp0..inp7 (the caller chooses which source rows map to which
 * argument, including any edge replication).
 * Positive accumulator: coef0*(inp0+inp4) + coef2*(inp2+inp6);
 * negative: (inp3+inp7) + coef1*(inp1+inp5).  The difference is
 * rounded with SRARI by 5 ((x + 16) >> 5) and clipped to [0, 255].
 * Returns the 16 filtered pixels as a v16u8.
 */
#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
                               inp4, inp5, inp6, inp7,                  \
                               coef0, coef1, coef2)                     \
( {                                                                     \
    v16u8 res;                                                          \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    SRARI_H2_SH(res_r, res_l, 5);                                       \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    res;                                                                \
} )
219
/*
 * 8-column vertical quarter-pel FIR producing TWO output rows at
 * once: inp0x feeds output row 0, inp1x feeds output row 1, with the
 * same tap structure as APPLY_VERT_QPEL_FILTER:
 * coef0*(x0+x4) + coef2*(x2+x6) - ((x3+x7) + coef1*(x1+x5)).
 * Rounded with SRARI by 5 ((x + 16) >> 5), clipped to [0, 255],
 * both 8-byte rows packed low/high into one v16u8.
 */
#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,          \
                                     inp04, inp05, inp06, inp07,          \
                                     inp10, inp11, inp12, inp13,          \
                                     inp14, inp15, inp16, inp17,          \
                                     coef0, coef1, coef2)                 \
( {                                                                       \
    v16u8 res;                                                            \
    v8i16 val0, val1;                                                     \
    v8u16 sum00, sum01, sum02, sum03;                                     \
    v8u16 sum10, sum11, sum12, sum13;                                     \
                                                                          \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,    \
               sum00, sum10, sum03, sum13);                               \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                \
    HADD_UB2_UH(sum03, sum13, sum03, sum13);                              \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,    \
               sum02, sum12, sum01, sum11);                               \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);               \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);               \
    val0 = (v8i16) (sum00 - sum03);                                       \
    val1 = (v8i16) (sum10 - sum13);                                       \
    SRARI_H2_SH(val0, val1, 5);                                           \
    CLIP_SH2_0_255(val0, val1);                                           \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);              \
                                                                          \
    res;                                                                  \
} )
247
/*
 * "No-rounding" variant of APPLY_VERT_QPEL_FILTER (16 columns):
 * identical tap accumulation, but scales with (x + 15) >> 5 instead
 * of the rounded SRARI ((x + 16) >> 5).  Clips to [0, 255] and
 * returns the 16 filtered pixels as a v16u8.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
                                        inp4, inp5, inp6, inp7,         \
                                        coef0, coef1, coef2)            \
( {                                                                     \
    v16u8 res;                                                          \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    res_r += 15;                                                        \
    res_l += 15;                                                        \
    res_r >>= 5;                                                        \
    res_l >>= 5;                                                        \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    res;                                                                \
} )
276
/*
 * "No-rounding" variant of APPLY_VERT_QPEL_FILTER_8BYTE (8 columns,
 * two output rows): identical tap accumulation, final scaling by
 * (x + 15) >> 5 instead of the rounded (x + 16) >> 5.  Clips to
 * [0, 255] and packs both filtered 8-byte rows into one v16u8.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
                                              inp04, inp05, inp06, inp07, \
                                              inp10, inp11, inp12, inp13, \
                                              inp14, inp15, inp16, inp17, \
                                              coef0, coef1, coef2)        \
( {                                                                       \
    v16u8 res;                                                            \
    v8i16 val0, val1;                                                     \
    v8u16 sum00, sum01, sum02, sum03;                                     \
    v8u16 sum10, sum11, sum12, sum13;                                     \
                                                                          \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,    \
               sum00, sum10, sum03, sum13);                               \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                \
    HADD_UB2_UH(sum03, sum13, sum03, sum13);                              \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,    \
               sum02, sum12, sum01, sum11);                               \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);               \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);               \
    val0 = (v8i16) (sum00 - sum03);                                       \
    val1 = (v8i16) (sum10 - sum13);                                       \
    val0 += 15;                                                           \
    val1 += 15;                                                           \
    val0 >>= 5;                                                           \
    val1 >>= 5;                                                           \
    CLIP_SH2_0_255(val0, val1);                                           \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);              \
                                                                          \
    res;                                                                  \
} )
307
/*
 * Horizontal qpel MC, 8-pixel width: filters each row with the
 * (20, 6, 3) tap set and takes the rounded average (aver_u_b:
 * (a + b + 1) >> 1) with the unshifted source pixel ("src0").
 * Processes 4 rows per iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        /* pack the two 8-byte source rows to match the packed results */
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
341
/*
 * Horizontal qpel MC, 16-pixel width: filters each row (inpN from
 * src, inpN+1 from src + 1) and takes the rounded average with the
 * unshifted source pixel ("src0").  Processes 4 rows per iteration;
 * assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp0, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp2, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp4, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(inp6, res);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
385
/*
 * Horizontal qpel MC, 8-pixel width: plain filter output with
 * rounded scaling, no source/dest averaging.  Processes 4 rows per
 * iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_8width_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
416
/*
 * Horizontal qpel MC, 16-pixel width: plain filter output with
 * rounded scaling, no source/dest averaging.  Processes 4 rows per
 * iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_16width_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
456
/*
 * Horizontal qpel MC, 8-pixel width: filters each row and takes the
 * rounded average with the source pixel shifted right by one
 * ("src1", obtained via a 1-byte sldi).  Processes 4 rows per
 * iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        /* shift each source row left by one byte -> pixel at +1 */
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
492
/*
 * Horizontal qpel MC, 16-pixel width: filters each row and takes the
 * rounded average with the source pixel at +1 (the inpN+1 vectors
 * already loaded from src + 1).  Processes 4 rows per iteration;
 * assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride,
                                                int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp1);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp5);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                      const20, const6, const3);
        res = __msa_aver_u_b(res, inp7);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
536
/*
 * No-rounding horizontal qpel MC, 8-pixel width: uses the
 * (x + 15) >> 5 filter variant and the truncating average ave_u_b
 * ((a + b) >> 1, vs. the rounding aver_u_b) with the unshifted
 * source pixel ("src0").  Processes 4 rows per iteration; assumes
 * height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride,
                                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        res0 = __msa_ave_u_b(inp0, res0);
        res1 = __msa_ave_u_b(inp2, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
571
/*
 * No-rounding horizontal qpel MC, 16-pixel width: (x + 15) >> 5
 * filter variant plus truncating average (ave_u_b) with the
 * unshifted source pixel ("src0").  Processes 4 rows per iteration;
 * assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v8u16 const20 = (v8u16) __msa_ldi_h(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp0, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp2, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp4, res);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(inp6, res);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
615
/*
 * No-rounding horizontal qpel MC, 8-pixel width: plain
 * (x + 15) >> 5 filter output, no source/dest averaging.
 * Processes 4 rows per iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
                                            int32_t src_stride,
                                            uint8_t *dst,
                                            int32_t dst_stride,
                                            int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
646
/*
 * No-rounding horizontal qpel MC, 16-pixel width: plain
 * (x + 15) >> 5 filter output, no source/dest averaging.
 * Processes 4 rows per iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
686
/*
 * No-rounding horizontal qpel MC, 8-pixel width: (x + 15) >> 5
 * filter variant, then truncating average (ave_u_b) with the source
 * pixel shifted right by one ("src1", via a 1-byte sldi).
 * Processes 4 rows per iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride,
                                                      int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                      mask2, mask3, const20,
                                                      const6, const3);
        /* shift each source row left by one byte -> pixel at +1 */
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        res0 = __msa_ave_u_b(inp0, res0);
        res1 = __msa_ave_u_b(inp2, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
723
/*
 * No-rounding horizontal qpel MC, 16-pixel width: (x + 15) >> 5
 * filter variant, then truncating average (ave_u_b) with the source
 * pixel at +1 (the inpN+1 vectors loaded from src + 1).
 * Processes 4 rows per iteration; assumes height is a multiple of 4.
 */
static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp1);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp3);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp5);
        ST_UB(res, dst);
        dst += dst_stride;

        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
                                               const20, const6, const3);
        res = __msa_ave_u_b(res, inp7);
        ST_UB(res, dst);
        dst += dst_stride;
    }
}
767
/*
 * Horizontal qpel MC with destination averaging, 8-pixel width:
 * filter, rounded-average with the unshifted source pixel ("src0"),
 * then rounded-average the result with the existing dst pixels
 * ("avg_dst").  Processes 4 rows per iteration; assumes height is a
 * multiple of 4.
 */
static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    /* VSHF patterns gathering the tap pairs for the 8-byte filter */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* pack src and dst row pairs to match the packed results */
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
806
/*
 * Horizontal qpel MC with destination averaging, 16-pixel width:
 * filter, rounded-average with the unshifted source pixel ("src0"),
 * then rounded-average with the existing dst pixels ("avg_dst").
 * Handles 4 rows per iteration, 2 at a time; assumes height is a
 * multiple of 4.
 */
static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
                                                        int32_t src_stride,
                                                        uint8_t *dst,
                                                        int32_t dst_stride,
                                                        int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1;
    v16u8 dst0, dst1;
    /* byte-reversal shuffle for the 16-byte filter's edge handling */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);

        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
847
/* Horizontal qpel MC, 8 pixels wide: lowpass filter (taps 20/6/3) on two
 * rows packed per vector, result averaged with the existing destination
 * bytes (avg-to-destination, no source averaging — half-pel position).
 * height is assumed to be a multiple of 4 (four rows per iteration).
 */
static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    /* shuffle masks that gather the filter-tap neighbours for the
       8-byte-per-row variant of the filter macro */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* pack two 8-byte dst rows per vector to match res0/res1 layout */
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
883
/* Horizontal qpel MC, 16 pixels wide: lowpass filter (taps 20/6/3),
 * result averaged with the bytes already in dst (avg-to-destination,
 * no source averaging — half-pel position).
 * height is assumed to be a multiple of 4 (four rows per iteration).
 */
static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1;
    v16u8 dst0, dst1;
    /* byte-reverse shuffle mask consumed by APPLY_HORIZ_QPEL_FILTER */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        /* even regs: rows at src; odd regs: same rows at src + 1 */
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);

        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
922
/* Horizontal qpel MC, 8 pixels wide: lowpass filter (taps 20/6/3),
 * result averaged with the offset-1 source pixels ("aver_src1" — the
 * input rows are shifted left by one byte with SLDI before averaging)
 * and then averaged with the bytes already in dst.
 * height is assumed to be a multiple of 4 (four rows per iteration).
 */
static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 res0, res1;
    /* shuffle masks that gather the filter-tap neighbours for the
       8-byte-per-row variant of the filter macro */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    for (loop_count = (height >> 2); loop_count--;) {
        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                             mask0, mask1, mask2, mask3,
                                             const20, const6, const3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* shift each row by 1 byte so the average uses src + 1 pixels */
        SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
                   inp0, inp1, inp2, inp3);
        /* pack two 8-byte rows per vector to match res0/res1 layout */
        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
        ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
963
/* Horizontal qpel MC, 16 pixels wide: lowpass filter (taps 20/6/3),
 * result averaged with the offset-1 source pixels ("aver_src1", the
 * odd-numbered registers loaded from src + 1) and then averaged with
 * the bytes already in dst.
 * height is assumed to be a multiple of 4 (four rows per iteration).
 */
static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
                                                        int32_t src_stride,
                                                        uint8_t *dst,
                                                        int32_t dst_stride,
                                                        int32_t height)
{
    uint8_t loop_count;
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
    v16u8 res0, res1, dst0, dst1;
    /* byte-reverse shuffle mask consumed by APPLY_HORIZ_QPEL_FILTER */
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    for (loop_count = (height >> 2); loop_count--;) {
        /* even regs: rows at src; odd regs: same rows at src + 1 */
        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
        src += (4 * src_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        /* average with the src + 1 pixels, then with the destination */
        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
                                       const20, const6, const3);
        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
                                       const20, const6, const3);
        LD_UB2(dst, dst_stride, dst0, dst1);
        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
        ST_UB2(res0, res1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
1002
1003
/* Vertical qpel MC, fixed 8x8 block: vertical lowpass filter (taps
 * 20/6/3) with edge rows mirrored at top and bottom (note the repeated
 * inp0 / inp8 arguments at the boundaries), result averaged with the
 * offset-0 source rows ("aver_src0"). Two output rows are produced per
 * filter-macro invocation (8 bytes each, stored with ST_D4).
 */
static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* rows 0-1: top edge mirrored (inp0 reused for the missing rows) */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* rows 2-3 */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    /* average with source rows 0-3 (packed two rows per vector) */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    /* rows 4-5 */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    /* rows 6-7: bottom edge mirrored (inp8/inp7 reused) */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
1052
/* Vertical qpel MC, fixed 16x16 block: vertical lowpass filter (taps
 * 20/6/3) with mirrored edge rows, one 16-byte output row per macro
 * invocation, each averaged with the offset-0 source row ("aver_src0":
 * output row N is averaged with input row N).
 */
static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* row 0: top edge mirrored (inp0 repeated for the missing rows) */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp0);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 1..12: sliding 8-row window, loading one/two new rows each
       step; row N is averaged with inpN */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp8, inp9);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp10, inp11);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp12, inp13);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    LD_UB2(src, src_stride, inp14, inp15);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 13-15: bottom edge mirrored (inp16/inp15/inp14 reused) */
    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp15);
    ST_UB(res0, dst);
}
1193
/* Vertical qpel MC, fixed 8x8 block: plain vertical lowpass filter
 * (taps 20/6/3) with mirrored edge rows, no source or destination
 * averaging (half-pel put). Two 8-byte output rows per macro call.
 */
static void vert_mc_qpel_8x8_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* rows 0-1: top edge mirrored (inp0 reused for the missing rows) */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* rows 2-3 */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    /* rows 4-5 */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    /* rows 6-7: bottom edge mirrored (inp8/inp7 reused) */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
1236
/* Vertical qpel MC, fixed 16x16 block: plain vertical lowpass filter
 * (taps 20/6/3) with mirrored edge rows, no source or destination
 * averaging (half-pel put). One 16-byte output row per macro call.
 */
static void vert_mc_qpel_16x16_msa(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* row 0: top edge mirrored (inp0 repeated for the missing rows) */
    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    inp4 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 1..12: sliding 8-row window, one new row loaded per step */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 13-15: bottom edge mirrored (inp16/inp15/inp14 reused) */
    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;
}
1372
/* Vertical qpel MC, fixed 8x8 block: vertical lowpass filter (taps
 * 20/6/3) with mirrored edge rows, result averaged with the offset-1
 * source rows ("aver_src1": output rows 0-7 are averaged with input
 * rows 1-8). Two 8-byte output rows per macro call.
 */
static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* rows 0-1: top edge mirrored (inp0 reused for the missing rows) */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);

    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* rows 2-3 */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    /* average with source rows 1-4 (one row below the output row) */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    /* rows 4-5 */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    /* rows 6-7: bottom edge mirrored (inp8/inp7 reused) */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
1422
/* Vertical qpel MC, fixed 16x16 block: vertical lowpass filter (taps
 * 20/6/3) with mirrored edge rows, each output row N averaged with the
 * offset-1 source row inp(N+1) ("aver_src1").
 */
static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* row 0: top edge mirrored (inp0 repeated), averaged with inp1 */
    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    inp4 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 1..12: sliding 8-row window; row N averaged with inp(N+1) */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 13-15: bottom edge mirrored (inp16/inp15/inp14 reused) */
    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp15);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    res0 = __msa_aver_u_b(res0, inp16);
    ST_UB(res0, dst);
}
1573
/* Vertical qpel MC, fixed 8x8 block, no-rounding variant: vertical
 * lowpass filter (taps 20/6/3, NO_ROUND macro) with mirrored edge rows,
 * result combined with the offset-0 source rows using __msa_ave_u_b
 * (truncating average, per the "no_rnd" naming) instead of the rounding
 * __msa_aver_u_b. Two 8-byte output rows per macro call.
 */
static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* rows 0-1: top edge mirrored (inp0 reused for the missing rows) */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                                 inp1, inp2, inp3, inp4,
                                                 inp1, inp0, inp0, inp1,
                                                 inp2, inp3, inp4, inp5,
                                                 const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                                 inp3, inp4, inp5, inp6,
                                                 inp3, inp2, inp1, inp0,
                                                 inp4, inp5, inp6, inp7,
                                                 const20, const6, const3);
    /* no-round average with source rows 0-3 (packed two per vector) */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    res0 = __msa_ave_u_b(res0, tmp0);
    res1 = __msa_ave_u_b(res1, tmp1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);

    inp8 = LD_UB(src);
    /* rows 4-5 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                                 inp5, inp6, inp7, inp8,
                                                 inp5, inp4, inp3, inp2,
                                                 inp6, inp7, inp8, inp8,
                                                 const20, const6, const3);
    /* rows 6-7: bottom edge mirrored (inp8/inp7 reused) */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                                 inp7, inp8, inp8, inp7,
                                                 inp7, inp6, inp5, inp4,
                                                 inp8, inp8, inp7, inp6,
                                                 const20, const6, const3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
    res0 = __msa_ave_u_b(res0, tmp0);
    res1 = __msa_ave_u_b(res1, tmp1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
1624
/* Vertical qpel MC, fixed 16x16 block, no-rounding variant: vertical
 * lowpass filter (taps 20/6/3, NO_ROUND macro) with mirrored edge rows,
 * each output row N combined with source row inpN via __msa_ave_u_b
 * (truncating average, per the "no_rnd" naming).
 */
static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* row 0: top edge mirrored (inp0 repeated for the missing rows) */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp0);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 1..12: sliding 8-row window; row N averaged with inpN */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* rows 13-15: bottom edge mirrored (inp16/inp15/inp14 reused) */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
                                           inp15, inp16, inp16, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
                                           inp16, inp16, inp15, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp15);
    ST_UB(res0, dst);
    dst += dst_stride;
}
1774
vert_mc_qpel_no_rnd_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1775 static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
1776 int32_t src_stride,
1777 uint8_t *dst,
1778 int32_t dst_stride)
1779 {
1780 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1781 v16u8 res0, res1;
1782 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1783 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1784 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1785
1786 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1787 src += (4 * src_stride);
1788 LD_UB2(src, src_stride, inp4, inp5);
1789 src += (2 * src_stride);
1790 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1791 inp1, inp2, inp3, inp4,
1792 inp1, inp0, inp0, inp1,
1793 inp2, inp3, inp4, inp5,
1794 const20, const6, const3);
1795 LD_UB2(src, src_stride, inp6, inp7);
1796 src += (2 * src_stride);
1797 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1798 inp3, inp4, inp5, inp6,
1799 inp3, inp2, inp1, inp0,
1800 inp4, inp5, inp6, inp7,
1801 const20, const6, const3);
1802 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1803
1804 inp8 = LD_UB(src);
1805 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1806 inp5, inp6, inp7, inp8,
1807 inp5, inp4, inp3, inp2,
1808 inp6, inp7, inp8, inp8,
1809 const20, const6, const3);
1810 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1811 inp7, inp8, inp8, inp7,
1812 inp7, inp6, inp5, inp4,
1813 inp8, inp8, inp7, inp6,
1814 const20, const6, const3);
1815 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1816 }
1817
/* Vertical quarter-pel motion compensation, 16x16 block, "no rounding"
 * variant.  One full 16-byte row is produced per macro call by the 8-tap
 * vertical filter with coefficient magnitudes 20, 6 and 3 (signs and the
 * no-round behaviour are inside APPLY_VERT_QPEL_NO_ROUND_FILTER).  Source
 * rows referenced outside the block are mirrored at the top and bottom
 * edges.  Fully unrolled: one source row is loaded per output row. */
static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Prime the filter with the first five source rows. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    /* Output row 0: rows above the block replicate inp0. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    /* Output row 1. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    /* Output row 2. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    /* Output row 3: first row with all eight taps inside the block. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Last source row: no pointer advance needed after this load. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output rows 13-15: bottom-edge taps mirror back up the block. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
                                           inp15, inp16, inp16, inp15,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
                                           inp16, inp16, inp15, inp14,
                                           const20, const6, const3);
    ST_UB(res0, dst);
}
1950
vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1951 static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
1952 int32_t src_stride,
1953 uint8_t *dst,
1954 int32_t dst_stride)
1955 {
1956 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1957 v16u8 tmp0, tmp1, res0, res1;
1958 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1959 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1960 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1961
1962 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1963 src += (4 * src_stride);
1964 LD_UB2(src, src_stride, inp4, inp5);
1965 src += (2 * src_stride);
1966 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1967 inp1, inp2, inp3, inp4,
1968 inp1, inp0, inp0, inp1,
1969 inp2, inp3, inp4, inp5,
1970 const20, const6, const3);
1971 LD_UB2(src, src_stride, inp6, inp7);
1972 src += (2 * src_stride);
1973 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1974 inp3, inp4, inp5, inp6,
1975 inp3, inp2, inp1, inp0,
1976 inp4, inp5, inp6, inp7,
1977 const20, const6, const3);
1978 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1979 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1980 res0 = __msa_ave_u_b(res0, tmp0);
1981 res1 = __msa_ave_u_b(res1, tmp1);
1982 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1983
1984 inp8 = LD_UB(src);
1985 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1986 inp5, inp6, inp7, inp8,
1987 inp5, inp4, inp3, inp2,
1988 inp6, inp7, inp8, inp8,
1989 const20, const6, const3);
1990 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1991 inp7, inp8, inp8, inp7,
1992 inp7, inp6, inp5, inp4,
1993 inp8, inp8, inp7, inp6,
1994 const20, const6, const3);
1995 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1996 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1997 res0 = __msa_ave_u_b(res0, tmp0);
1998 res1 = __msa_ave_u_b(res1, tmp1);
1999 ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2000 }
2001
/* Vertical quarter-pel MC, 16x16, "no rounding" variant, additionally
 * no-round averaged (__msa_ave_u_b) with the source row *below* each
 * output row (src1 position): filtered row y is averaged with source row
 * y + 1.  One 16-byte row per filter-macro call; rows outside the block
 * are mirrored at the top and bottom edges. */
static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Prime the filter with the first five source rows. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    /* Output row 0 (top edge replicates inp0), averaged with row 1. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Last source row; no pointer advance needed afterwards. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output rows 13-15: bottom-edge taps mirror back up the block. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
                                           inp15, inp16, inp16, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp15);
    ST_UB(res0, dst);
    dst += dst_stride;

    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
                                           inp16, inp16, inp15, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp16);
    ST_UB(res0, dst);
}
2150
vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2151 static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
2152 int32_t src_stride,
2153 uint8_t *dst,
2154 int32_t dst_stride)
2155 {
2156 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2157 v16u8 dst0, dst1, dst2, dst3;
2158 v16u8 tmp0, tmp1, res0, res1;
2159 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2160 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2161 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2162
2163 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2164 src += (4 * src_stride);
2165 LD_UB2(src, src_stride, inp4, inp5);
2166 src += (2 * src_stride);
2167 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2168 inp1, inp2, inp3, inp4,
2169 inp1, inp0, inp0, inp1,
2170 inp2, inp3, inp4, inp5,
2171 const20, const6, const3);
2172
2173 LD_UB2(src, src_stride, inp6, inp7);
2174 src += (2 * src_stride);
2175 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2176 inp3, inp4, inp5, inp6,
2177 inp3, inp2, inp1, inp0,
2178 inp4, inp5, inp6, inp7,
2179 const20, const6, const3);
2180
2181 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2182 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2183 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2184 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2185 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2186 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2187 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2188 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2189 dst += (4 * dst_stride);
2190
2191 inp8 = LD_UB(src);
2192 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2193 inp5, inp6, inp7, inp8,
2194 inp5, inp4, inp3, inp2,
2195 inp6, inp7, inp8, inp8,
2196 const20, const6, const3);
2197 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2198 inp7, inp8, inp8, inp7,
2199 inp7, inp6, inp5, inp4,
2200 inp8, inp8, inp7, inp6,
2201 const20, const6, const3);
2202
2203 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2204 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2205 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2206 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2207 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2208 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2209 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2210 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2211 }
2212
/* Vertical quarter-pel MC, 16x16, rounding variant: each filtered row is
 * first averaged (AVER, rounding) with the source row at the same vertical
 * position (src0), then averaged with the existing destination pixels
 * (avg_dst semantics).  Two 16-byte rows are produced per iteration;
 * edge rows are mirrored at the top and bottom of the block. */
static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0, res1, dst0, dst1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Prime the filter with the first five source rows. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    /* Output rows 0/1 (top edge replicates inp0). */
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);

    inp5 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);

    LD_UB2(dst, dst_stride, dst0, dst1);
    /* Average with source rows 0/1, then with the destination. */
    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp6 = LD_UB(src);
    src += src_stride;
    /* Output rows 2/3. */
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);

    inp7 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);

    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(src, src_stride, inp8, inp9);
    src += (2 * src_stride);
    /* Output rows 4/5. */
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);

    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(src, src_stride, inp10, inp11);
    src += (2 * src_stride);
    /* Output rows 6/7. */
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);

    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(src, src_stride, inp12, inp13);
    src += (2 * src_stride);
    /* Output rows 8/9. */
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(src, src_stride, inp14, inp15);
    src += (2 * src_stride);
    /* Output rows 10/11. */
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);

    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row; bottom-edge taps begin mirroring (inp16 reused). */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 14/15: bottom edge fully mirrored. */
    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}
2344
vert_mc_qpel_avg_dst_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2345 static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
2346 int32_t src_stride,
2347 uint8_t *dst,
2348 int32_t dst_stride)
2349 {
2350 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2351 v16u8 dst0, dst1, dst2, dst3;
2352 v16u8 res0, res1;
2353 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2354 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2355 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2356
2357 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2358 src += (4 * src_stride);
2359 LD_UB2(src, src_stride, inp4, inp5);
2360 src += (2 * src_stride);
2361 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2362 inp1, inp2, inp3, inp4,
2363 inp1, inp0, inp0, inp1,
2364 inp2, inp3, inp4, inp5,
2365 const20, const6, const3);
2366 LD_UB2(src, src_stride, inp6, inp7);
2367 src += (2 * src_stride);
2368 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2369 inp3, inp4, inp5, inp6,
2370 inp3, inp2, inp1, inp0,
2371 inp4, inp5, inp6, inp7,
2372 const20, const6, const3);
2373 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2374 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2375 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2376 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2377 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2378 dst += (4 * dst_stride);
2379
2380 inp8 = LD_UB(src);
2381 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2382 inp5, inp6, inp7, inp8,
2383 inp5, inp4, inp3, inp2,
2384 inp6, inp7, inp8, inp8,
2385 const20, const6, const3);
2386 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2387 inp7, inp8, inp8, inp7,
2388 inp7, inp6, inp5, inp4,
2389 inp8, inp8, inp7, inp6,
2390 const20, const6, const3);
2391 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2392 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2393 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2394 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2395 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2396 }
2397
/* Vertical quarter-pel MC, 16x16, rounding variant with destination
 * averaging: each filtered row is averaged (AVER, rounding) with the
 * pixels already in dst.  Two 16-byte rows are produced per iteration;
 * edge rows are mirrored at the top and bottom of the block. */
static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0, res1, dst0, dst1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Prime the filter with the first five source rows. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    /* Output rows 0/1 (top edge replicates inp0). */
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    inp5 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp6 = LD_UB(src);
    src += src_stride;
    /* Output rows 2/3. */
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    inp7 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp8 = LD_UB(src);
    src += src_stride;
    /* Output rows 4/5. */
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    inp9 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp10 = LD_UB(src);
    src += src_stride;
    /* Output rows 6/7. */
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    inp11 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp12 = LD_UB(src);
    src += src_stride;
    /* Output rows 8/9. */
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    inp13 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    inp14 = LD_UB(src);
    src += src_stride;
    /* Output rows 10/11. */
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    inp15 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row; bottom-edge taps begin mirroring (inp16 reused). */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 14/15: bottom edge fully mirrored. */
    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}
2522
/* Vertical quarter-pel MC, 8x8, rounding variant: the filtered row is
 * first averaged (AVER, rounding) with the source row *below* it (src1
 * position, i.e. source row y + 1), then averaged with the existing
 * destination pixels (avg_dst semantics).  Two 8-byte rows per
 * filter-macro call; row pairs are packed with __msa_insve_d. */
static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 tmp0, tmp1, res0, res1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* Output rows 0/1 (top edge replicates inp0). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* Output rows 2/3. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack source rows 1/2 and 3/4 (the "row below" for each output). */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    inp8 = LD_UB(src);
    /* Output rows 4/5 and 6/7 (bottom edge mirrors inp8 back up). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack source rows 5/6 and 7/8 for the src1 averaging step. */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
2581
vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2582 static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
2583 int32_t src_stride,
2584 uint8_t *dst,
2585 int32_t dst_stride)
2586 {
2587 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2588 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2589 v16u8 res0, res1, dst0, dst1;
2590 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2591 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2592 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2593
2594 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2595 src += (5 * src_stride);
2596 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2597 inp1, inp2, inp3, inp4,
2598 const20, const6, const3);
2599 inp5 = LD_UB(src);
2600 src += src_stride;
2601 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2602 inp2, inp3, inp4, inp5,
2603 const20, const6, const3);
2604 LD_UB2(dst, dst_stride, dst0, dst1);
2605 AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
2606 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2607 ST_UB2(res0, res1, dst, dst_stride);
2608 dst += (2 * dst_stride);
2609
2610 inp6 = LD_UB(src);
2611 src += src_stride;
2612 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2613 inp3, inp4, inp5, inp6,
2614 const20, const6, const3);
2615 inp7 = LD_UB(src);
2616 src += src_stride;
2617 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2618 inp4, inp5, inp6, inp7,
2619 const20, const6, const3);
2620 LD_UB2(dst, dst_stride, dst0, dst1);
2621 AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
2622 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2623 ST_UB2(res0, res1, dst, dst_stride);
2624 dst += (2 * dst_stride);
2625
2626 inp8 = LD_UB(src);
2627 src += src_stride;
2628 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2629 inp5, inp6, inp7, inp8,
2630 const20, const6, const3);
2631 inp9 = LD_UB(src);
2632 src += src_stride;
2633 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2634 inp6, inp7, inp8, inp9,
2635 const20, const6, const3);
2636 LD_UB2(dst, dst_stride, dst0, dst1);
2637 AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
2638 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2639 ST_UB2(res0, res1, dst, dst_stride);
2640 dst += (2 * dst_stride);
2641
2642 inp10 = LD_UB(src);
2643 src += src_stride;
2644 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2645 inp7, inp8, inp9, inp10,
2646 const20, const6, const3);
2647 inp11 = LD_UB(src);
2648 src += src_stride;
2649 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2650 inp8, inp9, inp10, inp11,
2651 const20, const6, const3);
2652 LD_UB2(dst, dst_stride, dst0, dst1);
2653 AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
2654 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2655 ST_UB2(res0, res1, dst, dst_stride);
2656 dst += (2 * dst_stride);
2657
2658 inp12 = LD_UB(src);
2659 src += src_stride;
2660 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2661 inp9, inp10, inp11, inp12,
2662 const20, const6, const3);
2663 inp13 = LD_UB(src);
2664 src += src_stride;
2665 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2666 inp10, inp11, inp12, inp13,
2667 const20, const6, const3);
2668 LD_UB2(dst, dst_stride, dst0, dst1);
2669 AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
2670 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2671 ST_UB2(res0, res1, dst, dst_stride);
2672 dst += (2 * dst_stride);
2673
2674 inp14 = LD_UB(src);
2675 src += src_stride;
2676 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2677 inp11, inp12, inp13, inp14,
2678 const20, const6, const3);
2679 inp15 = LD_UB(src);
2680 src += src_stride;
2681 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2682 inp12, inp13, inp14, inp15,
2683 const20, const6, const3);
2684 LD_UB2(dst, dst_stride, dst0, dst1);
2685 AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2686 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2687 ST_UB2(res0, res1, dst, dst_stride);
2688 dst += (2 * dst_stride);
2689
2690 inp16 = LD_UB(src);
2691 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2692 inp13, inp14, inp15, inp16,
2693 const20, const6, const3);
2694 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2695 inp14, inp15, inp16, inp16,
2696 const20, const6, const3);
2697 LD_UB2(dst, dst_stride, dst0, dst1);
2698 AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2699 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2700 ST_UB2(res0, res1, dst, dst_stride);
2701 dst += (2 * dst_stride);
2702
2703 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2704 inp15, inp16, inp16, inp15,
2705 const20, const6, const3);
2706 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2707 inp16, inp16, inp15, inp14,
2708 const20, const6, const3);
2709 LD_UB2(dst, dst_stride, dst0, dst1);
2710 AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2711 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2712 ST_UB2(res0, res1, dst, dst_stride);
2713 }
2714
/* Horizontal qpel filter (no-rounding variant) over a 16-pixel-wide
 * column: each filtered row is averaged (truncating __msa_ave_u_b) with
 * the unshifted source row ("src0" half-pel case).  Emits height + 1
 * rows — the extra row feeds the subsequent vertical pass. */
static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int32_t height)
{
    uint8_t cnt = height >> 2;
    v16u8 row0, row1, row2, row3;
    v16u8 row0_sh, row1_sh, row2_sh, row3_sh;
    v16u8 out;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    while (cnt--) {
        /* four rows per iteration: unshifted and +1-shifted source */
        LD_UB4(src, src_stride, row0, row1, row2, row3);
        LD_UB4((src + 1), src_stride, row0_sh, row1_sh, row2_sh, row3_sh);
        src += (4 * src_stride);

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(row0, out);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row1, row1_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(row1, out);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row2, row2_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(row2, out);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row3, row3_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(row3, out);
        ST_UB(out, dst);
        dst += dst_stride;
    }

    /* one extra row for the vertical filter's look-ahead */
    LD_UB2(src, 1, row0, row0_sh);
    out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                           const20, const6, const3);
    out = __msa_ave_u_b(row0, out);
    ST_UB(out, dst);
}
2764
/* 16x16 HV qpel, no rounding, averaging with the (0,0) half-pel source
 * in both directions: horizontal pass into a 16x17 scratch buffer,
 * then the averaging vertical pass. */
static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
2775
/* 8x8 HV quarter-pel MC, no-rounding variant, "src00":
 *   - horizontal: qpel filter (tap magnitudes 20/6/3 per the const*
 *     vectors), result averaged with the unshifted source pixels via the
 *     truncating __msa_ave_u_b,
 *   - vertical:   qpel filter over the 9 intermediate rows horiz0..horiz8,
 *     result averaged with the corresponding intermediate rows.
 * Two 8-byte rows are packed per 16-byte vector: even row in the low
 * doubleword, odd row (extracted with splati_d) in the high doubleword. */
static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* rows 0-1: horizontal filter, then average with the source rows */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* pack the two source rows into one register to match res0's layout */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* average with intermediate rows 0-1 (vertical half-pel component) */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* rows 6-7 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* row 8: single extra row needed by the vertical filter */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
2869
/* Horizontal qpel filter (no-rounding variant) over a 16-pixel-wide
 * column, no source averaging.  Emits height + 1 rows; the extra row
 * feeds the subsequent vertical pass. */
static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int32_t height)
{
    uint8_t cnt = height >> 2;
    v16u8 row0, row1, row2, row3;
    v16u8 row0_sh, row1_sh, row2_sh, row3_sh;
    v16u8 out;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    while (cnt--) {
        /* four rows per iteration: unshifted and +1-shifted source */
        LD_UB4(src, src_stride, row0, row1, row2, row3);
        LD_UB4((src + 1), src_stride, row0_sh, row1_sh, row2_sh, row3_sh);
        src += (4 * src_stride);

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                               const20, const6, const3);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row1, row1_sh, mask,
                                               const20, const6, const3);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row2, row2_sh, mask,
                                               const20, const6, const3);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row3, row3_sh, mask,
                                               const20, const6, const3);
        ST_UB(out, dst);
        dst += dst_stride;
    }

    /* one extra row for the vertical filter's look-ahead */
    LD_UB2(src, 1, row0, row0_sh);
    out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                           const20, const6, const3);
    ST_UB(out, dst);
}
2914
/* 16x16 HV qpel, no rounding: plain horizontal pass into a 16x17
 * scratch buffer, then a vertical pass that averages with the upper
 * intermediate row ("v_src0"). */
static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
2925
/* 8x8 HV quarter-pel MC, no-rounding variant, "v_src0":
 *   - horizontal: qpel filter (tap magnitudes 20/6/3), no averaging,
 *   - vertical:   qpel filter over the 9 intermediate rows horiz0..horiz8,
 *     result averaged (truncating __msa_ave_u_b) with the corresponding
 *     intermediate rows.
 * Two 8-byte rows are packed per 16-byte vector: even row in the low
 * doubleword, odd row (extracted with splati_d) in the high doubleword.
 *
 * Fix vs. previous revision: a redundant re-average of res0 (its result
 * was unconditionally overwritten before any use) has been removed; the
 * stores and all computed outputs are unchanged. */
static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* intermediate rows 0-1 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);

    /* intermediate rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* intermediate rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* intermediate rows 6-7 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* intermediate row 8: single extra row needed by the vertical filter */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3013
/* Horizontal qpel filter (no-rounding variant) over a 16-pixel-wide
 * column: each filtered row is averaged (truncating, and commutative,
 * __msa_ave_u_b) with the +1-shifted source row ("src1" half-pel case).
 * Emits height + 1 rows; the extra row feeds the vertical pass. */
static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int32_t height)
{
    uint8_t cnt = height >> 2;
    v16u8 row0, row1, row2, row3;
    v16u8 row0_sh, row1_sh, row2_sh, row3_sh;
    v16u8 out;
    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);
    v8u16 const20 = (v8u16) __msa_ldi_h(20);

    while (cnt--) {
        /* four rows per iteration: unshifted and +1-shifted source */
        LD_UB4(src, src_stride, row0, row1, row2, row3);
        LD_UB4((src + 1), src_stride, row0_sh, row1_sh, row2_sh, row3_sh);
        src += (4 * src_stride);

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(out, row0_sh);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row1, row1_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(out, row1_sh);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row2, row2_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(out, row2_sh);
        ST_UB(out, dst);
        dst += dst_stride;

        out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row3, row3_sh, mask,
                                               const20, const6, const3);
        out = __msa_ave_u_b(out, row3_sh);
        ST_UB(out, dst);
        dst += dst_stride;
    }

    /* one extra row for the vertical filter's look-ahead */
    LD_UB2(src, 1, row0, row0_sh);
    out = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(row0, row0_sh, mask,
                                           const20, const6, const3);
    out = __msa_ave_u_b(out, row0_sh);
    ST_UB(out, dst);
}
3063
/* 16x16 HV qpel, no rounding, "src10": horizontal pass averaging with
 * the right-shifted source into a 16x17 scratch buffer, followed by the
 * vertical pass averaging with the upper intermediate row. */
static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
3074
/* 8x8 HV quarter-pel MC, no-rounding variant, "src10":
 *   - horizontal: qpel filter, result averaged (truncating __msa_ave_u_b)
 *     with the source shifted one byte left (SLDI by 1), i.e. the pixel
 *     one to the right,
 *   - vertical:   qpel filter over the 9 intermediate rows, result
 *     averaged with the corresponding intermediate rows.
 * Two 8-byte rows are packed per 16-byte vector: even row in the low
 * doubleword, odd row (extracted with splati_d) in the high doubleword. */
static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* rows 0-1: horizontal filter + average with the right neighbour */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* shift source left by one byte so lane i holds pixel i+1 */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* average with intermediate rows 0-1 (vertical half-pel component) */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* rows 6-7 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* row 8: single extra row needed by the vertical filter */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3177
/* 16x16 HV qpel, no rounding, "h_src0": horizontal pass averaging with
 * the unshifted source into a 16x17 scratch buffer, then a plain
 * (non-averaging) vertical pass. */
static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
3188
/* 8x8 HV quarter-pel MC, no-rounding variant, "h_src0":
 *   - horizontal: qpel filter, result averaged (truncating __msa_ave_u_b)
 *     with the unshifted source pixels,
 *   - vertical:   qpel filter over the 9 intermediate rows, no averaging.
 * Two 8-byte rows are packed per 16-byte vector: even row in the low
 * doubleword, odd row (extracted with splati_d) in the high doubleword. */
static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* rows 0-1: horizontal filter + average with the source rows */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* pack the two source rows into one register to match res0's layout */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);

    /* rows 6-7 loaded early to overlap with the store below */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* row 8: single extra row needed by the vertical filter */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    /* store rows 2-5 in one call */
    ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3272
/* 16x16 HV qpel, no rounding, no half-pel averaging in either
 * direction: plain horizontal pass into a 16x17 scratch buffer,
 * then the plain vertical pass. */
static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
                                        int32_t src_stride,
                                        uint8_t *dst,
                                        int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
3283
/* 8x8 HV quarter-pel MC, no-rounding variant, no half-pel averaging:
 * horizontal qpel filter producing 9 intermediate rows horiz0..horiz8,
 * then the vertical qpel filter over them.  Two 8-byte rows are packed
 * per 16-byte vector: even row in the low doubleword, odd row
 * (extracted with splati_d) in the high doubleword. */
static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* intermediate rows 0-1 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* intermediate rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* intermediate rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* rows 6-7 loaded early to overlap with the store below */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* intermediate row 8: single extra row for the vertical filter */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;


    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    /* store rows 4-7 in one call */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3358
/* 16x16 HV qpel, no rounding, "h_src1": horizontal pass averaging with
 * the right-shifted source into a 16x17 scratch buffer, then a plain
 * (non-averaging) vertical pass. */
static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    uint8_t tmp_buf[16 * 17];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_no_rnd_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
3369
/* 8x8 HV quarter-pel MC, no-rounding variant, "h_src1":
 *   - horizontal: qpel filter, result averaged (truncating __msa_ave_u_b)
 *     with the source shifted one byte left (SLDI by 1), i.e. the pixel
 *     one to the right,
 *   - vertical:   qpel filter over the 9 intermediate rows, no averaging.
 * Two 8-byte rows are packed per 16-byte vector: even row in the low
 * doubleword, odd row (extracted with splati_d) in the high doubleword. */
static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* shuffle masks pairing the source bytes consumed by each filter tap */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* rows 0-1: horizontal filter + average with the right neighbour */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* shift source left by one byte so lane i holds pixel i+1 */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* rows 2-3 */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* rows 4-5 */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* vertical filter for output rows 0-1; top taps are edge-mirrored */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* rows 6-7 loaded early to overlap with the store below */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* row 8: single extra row needed by the vertical filter */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* vertical filter for output rows 2-3 */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    /* vertical filter for output rows 4-5; bottom taps clamp at horiz8 */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* vertical filter for output rows 6-7; bottom taps edge-mirrored */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    /* store rows 4-7 in one call */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3461
hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3462 static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
3463 int32_t src_stride,
3464 uint8_t *dst,
3465 int32_t dst_stride)
3466 {
3467 uint8_t buff[272];
3468
3469 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3470 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3471 }
3472
hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3473 static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
3474 int32_t src_stride,
3475 uint8_t *dst,
3476 int32_t dst_stride)
3477 {
3478 v16u8 inp0, inp1, inp2, inp3;
3479 v16u8 res0, res1, avg0, avg1;
3480 v16u8 horiz0, horiz1, horiz2, horiz3;
3481 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3482 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3483 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3484 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3485 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3486 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3487 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3488 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3489
3490 LD_UB2(src, src_stride, inp0, inp1);
3491 src += (2 * src_stride);
3492 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3493 mask2, mask3, const20,
3494 const6, const3);
3495 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3496 horiz0 = __msa_ave_u_b(inp0, res0);
3497 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3498 LD_UB2(src, src_stride, inp2, inp3);
3499 src += (2 * src_stride);
3500 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3501 mask2, mask3, const20,
3502 const6, const3);
3503 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3504 horiz2 = __msa_ave_u_b(inp2, res1);
3505 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3506 LD_UB2(src, src_stride, inp0, inp1);
3507 src += (2 * src_stride);
3508 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3509 mask2, mask3, const20,
3510 const6, const3);
3511 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3512 horiz4 = __msa_ave_u_b(inp0, res0);
3513 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3514 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3515 horiz1, horiz2, horiz3, horiz4,
3516 horiz1, horiz0, horiz0, horiz1,
3517 horiz2, horiz3, horiz4, horiz5,
3518 const20, const6, const3);
3519 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3520 res0 = __msa_ave_u_b(avg0, res0);
3521 ST_D2(res0, 0, 1, dst, dst_stride);
3522 dst += (2 * dst_stride);
3523
3524 LD_UB2(src, src_stride, inp2, inp3);
3525 src += (2 * src_stride);
3526 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3527 mask2, mask3, const20,
3528 const6, const3);
3529 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3530 horiz6 = __msa_ave_u_b(inp2, res1);
3531 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3532 inp0 = LD_UB(src);
3533 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3534 mask2, mask3, const20,
3535 const6, const3);
3536 horiz8 = __msa_ave_u_b(inp0, res0);
3537 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3538 horiz3, horiz4, horiz5, horiz6,
3539 horiz3, horiz2, horiz1, horiz0,
3540 horiz4, horiz5, horiz6, horiz7,
3541 const20, const6, const3);
3542 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3543 res1 = __msa_ave_u_b(avg1, res1);
3544 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3545 horiz5, horiz6, horiz7, horiz8,
3546 horiz5, horiz4, horiz3, horiz2,
3547 horiz6, horiz7, horiz8, horiz8,
3548 const20, const6, const3);
3549 ST_D2(res1, 0, 1, dst, dst_stride);
3550 dst += 2 * dst_stride;
3551
3552 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3553 res0 = __msa_ave_u_b(avg0, res0);
3554
3555 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3556 horiz7, horiz8, horiz8, horiz7,
3557 horiz7, horiz6, horiz5, horiz4,
3558 horiz8, horiz8, horiz7, horiz6,
3559 const20, const6, const3);
3560 ST_D2(res0, 0, 1, dst, dst_stride);
3561 dst += 2 * dst_stride;
3562
3563 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3564 res1 = __msa_ave_u_b(avg1, res1);
3565 ST_D2(res1, 0, 1, dst, dst_stride);
3566 }
3567
hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3568 static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
3569 int32_t src_stride,
3570 uint8_t *dst,
3571 int32_t dst_stride)
3572 {
3573 uint8_t buff[272];
3574
3575 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3576 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3577 }
3578
hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3579 static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
3580 int32_t src_stride,
3581 uint8_t *dst,
3582 int32_t dst_stride)
3583 {
3584 v16u8 inp0, inp1, inp2, inp3;
3585 v16u8 res0, res1, avg0, avg1;
3586 v16u8 horiz0, horiz1, horiz2, horiz3;
3587 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3588 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3589 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3590 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3591 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3592 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3593 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3594 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3595
3596 LD_UB2(src, src_stride, inp0, inp1);
3597 src += (2 * src_stride);
3598 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3599 mask2, mask3, const20,
3600 const6, const3);
3601 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3602 LD_UB2(src, src_stride, inp2, inp3);
3603 src += (2 * src_stride);
3604 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3605 mask2, mask3, const20,
3606 const6, const3);
3607 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3608 LD_UB2(src, src_stride, inp0, inp1);
3609 src += (2 * src_stride);
3610 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3611 mask2, mask3, const20,
3612 const6, const3);
3613 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3614 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3615 horiz1, horiz2, horiz3, horiz4,
3616 horiz1, horiz0, horiz0, horiz1,
3617 horiz2, horiz3, horiz4, horiz5,
3618 const20, const6, const3);
3619 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3620 res0 = __msa_ave_u_b(avg0, res0);
3621 LD_UB2(src, src_stride, inp2, inp3);
3622 src += (2 * src_stride);
3623 ST_D2(res0, 0, 1, dst, dst_stride);
3624 dst += 2 * dst_stride;
3625
3626 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3627 mask2, mask3, const20,
3628 const6, const3);
3629 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3630 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3631 horiz3, horiz4, horiz5, horiz6,
3632 horiz3, horiz2, horiz1, horiz0,
3633 horiz4, horiz5, horiz6, horiz7,
3634 const20, const6, const3);
3635 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3636 res1 = __msa_ave_u_b(avg1, res1);
3637 inp0 = LD_UB(src);
3638 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3639 mask2, mask3, const20,
3640 const6, const3);
3641 ST_D2(res1, 0, 1, dst, dst_stride);
3642 dst += 2 * dst_stride;
3643
3644 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3645 horiz5, horiz6, horiz7, horiz8,
3646 horiz5, horiz4, horiz3, horiz2,
3647 horiz6, horiz7, horiz8, horiz8,
3648 const20, const6, const3);
3649 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3650 res0 = __msa_ave_u_b(avg0, res0);
3651 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3652 horiz7, horiz8, horiz8, horiz7,
3653 horiz7, horiz6, horiz5, horiz4,
3654 horiz8, horiz8, horiz7, horiz6,
3655 const20, const6, const3);
3656 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3657 res1 = __msa_ave_u_b(avg1, res1);
3658 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3659 }
3660
hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3661 static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
3662 int32_t src_stride,
3663 uint8_t *dst,
3664 int32_t dst_stride)
3665 {
3666 uint8_t buff[272];
3667
3668 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3669 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3670 }
3671
hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3672 static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
3673 int32_t src_stride,
3674 uint8_t *dst,
3675 int32_t dst_stride)
3676 {
3677 v16u8 inp0, inp1, inp2, inp3;
3678 v16u8 res0, res1, avg0, avg1;
3679 v16u8 horiz0, horiz1, horiz2, horiz3;
3680 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3681 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3682 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3683 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3684 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3685 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3686 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3687 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3688
3689 LD_UB2(src, src_stride, inp0, inp1);
3690 src += (2 * src_stride);
3691 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3692 mask2, mask3, const20,
3693 const6, const3);
3694 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3695
3696 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3697 horiz0 = __msa_ave_u_b(inp0, res0);
3698 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3699 LD_UB2(src, src_stride, inp2, inp3);
3700 src += (2 * src_stride);
3701 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3702 mask2, mask3, const20,
3703 const6, const3);
3704 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3705
3706 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3707 horiz2 = __msa_ave_u_b(inp2, res1);
3708 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3709 LD_UB2(src, src_stride, inp0, inp1);
3710 src += (2 * src_stride);
3711 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3712 mask2, mask3, const20,
3713 const6, const3);
3714
3715 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3716 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3717 horiz4 = __msa_ave_u_b(inp0, res0);
3718 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3719 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3720 horiz1, horiz2, horiz3, horiz4,
3721 horiz1, horiz0, horiz0, horiz1,
3722 horiz2, horiz3, horiz4, horiz5,
3723 const20, const6, const3);
3724 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3725 res0 = __msa_ave_u_b(avg0, res0);
3726 ST_D2(res0, 0, 1, dst, dst_stride);
3727 dst += (2 * dst_stride);
3728
3729 LD_UB2(src, src_stride, inp2, inp3);
3730 src += (2 * src_stride);
3731 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3732 mask2, mask3, const20,
3733 const6, const3);
3734 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3735
3736 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3737 horiz6 = __msa_ave_u_b(inp2, res1);
3738 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3739 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3740 horiz3, horiz4, horiz5, horiz6,
3741 horiz3, horiz2, horiz1, horiz0,
3742 horiz4, horiz5, horiz6, horiz7,
3743 const20, const6, const3);
3744 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3745 res1 = __msa_ave_u_b(avg1, res1);
3746 ST_D2(res1, 0, 1, dst, dst_stride);
3747 dst += (2 * dst_stride);
3748
3749 inp0 = LD_UB(src);
3750 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3751 mask2, mask3, const20,
3752 const6, const3);
3753 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3754 horiz8 = __msa_ave_u_b(inp0, res0);
3755 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3756 horiz5, horiz6, horiz7, horiz8,
3757 horiz5, horiz4, horiz3, horiz2,
3758 horiz6, horiz7, horiz8, horiz8,
3759 const20, const6, const3);
3760 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3761 horiz7, horiz8, horiz8, horiz7,
3762 horiz7, horiz6, horiz5, horiz4,
3763 horiz8, horiz8, horiz7, horiz6,
3764 const20, const6, const3);
3765 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3766 res0 = __msa_ave_u_b(avg0, res0);
3767 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3768 res1 = __msa_ave_u_b(avg1, res1);
3769 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3770 }
3771
hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)3772 static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
3773 int32_t src_stride,
3774 uint8_t *dst,
3775 int32_t dst_stride,
3776 int32_t height)
3777 {
3778 uint8_t loop_count;
3779 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3780 v16u8 res;
3781 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3782 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3783 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3784 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3785
3786 for (loop_count = (height >> 2); loop_count--;) {
3787 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3788 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3789 src += (4 * src_stride);
3790 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3791 const20, const6, const3);
3792 res = __msa_aver_u_b(inp0, res);
3793 ST_UB(res, dst);
3794 dst += dst_stride;
3795
3796 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3797 const20, const6, const3);
3798 res = __msa_aver_u_b(inp2, res);
3799 ST_UB(res, dst);
3800 dst += dst_stride;
3801
3802 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3803 const20, const6, const3);
3804 res = __msa_aver_u_b(inp4, res);
3805 ST_UB(res, dst);
3806 dst += dst_stride;
3807
3808 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3809 const20, const6, const3);
3810 res = __msa_aver_u_b(inp6, res);
3811 ST_UB(res, dst);
3812 dst += dst_stride;
3813 }
3814
3815 LD_UB2(src, 1, inp0, inp1);
3816 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3817 res = __msa_aver_u_b(inp0, res);
3818 ST_UB(res, dst);
3819 }
3820
hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3821 static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
3822 int32_t src_stride,
3823 uint8_t *dst,
3824 int32_t dst_stride)
3825 {
3826 uint8_t buff[272];
3827
3828 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3829 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3830 }
3831
hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3832 static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
3833 int32_t src_stride,
3834 uint8_t *dst,
3835 int32_t dst_stride)
3836 {
3837 v16u8 inp0, inp1, inp2, inp3;
3838 v16u8 res0, res1, avg0, avg1;
3839 v16u8 horiz0, horiz1, horiz2, horiz3;
3840 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3841 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3842 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3843 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3844 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3845 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3846 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3847 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3848
3849 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3850 src += (4 * src_stride);
3851 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3852 const20, const6, const3);
3853 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3854 const20, const6, const3);
3855 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3856 horiz0 = __msa_aver_u_b(inp0, res0);
3857 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3858 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3859 horiz2 = __msa_aver_u_b(inp2, res1);
3860 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3861 LD_UB2(src, src_stride, inp0, inp1);
3862 src += (2 * src_stride);
3863 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3864 const20, const6, const3);
3865 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3866 horiz4 = __msa_aver_u_b(inp0, res0);
3867 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3868 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3869 horiz1, horiz2, horiz3, horiz4,
3870 horiz1, horiz0, horiz0, horiz1,
3871 horiz2, horiz3, horiz4, horiz5,
3872 const20, const6, const3);
3873 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3874 res0 = __msa_aver_u_b(avg0, res0);
3875 ST_D2(res0, 0, 1, dst, dst_stride);
3876 dst += (2 * dst_stride);
3877
3878 LD_UB2(src, src_stride, inp2, inp3);
3879 src += (2 * src_stride);
3880 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3881 const20, const6, const3);
3882 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3883 horiz6 = __msa_aver_u_b(inp2, res1);
3884 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3885 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3886 horiz3, horiz4, horiz5, horiz6,
3887 horiz3, horiz2, horiz1, horiz0,
3888 horiz4, horiz5, horiz6, horiz7,
3889 const20, const6, const3);
3890 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3891 res1 = __msa_aver_u_b(avg1, res1);
3892
3893 inp0 = LD_UB(src);
3894 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3895 const20, const6, const3);
3896 horiz8 = __msa_aver_u_b(inp0, res0);
3897 ST_D2(res1, 0, 1, dst, dst_stride);
3898 dst += 2 * dst_stride;
3899
3900 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3901 horiz5, horiz6, horiz7, horiz8,
3902 horiz5, horiz4, horiz3, horiz2,
3903 horiz6, horiz7, horiz8, horiz8,
3904 const20, const6, const3);
3905 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3906 res0 = __msa_aver_u_b(avg0, res0);
3907 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3908 horiz7, horiz8, horiz8, horiz7,
3909 horiz7, horiz6, horiz5, horiz4,
3910 horiz8, horiz8, horiz7, horiz6,
3911 const20, const6, const3);
3912 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3913 res1 = __msa_aver_u_b(avg1, res1);
3914 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3915 }
3916
hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)3917 static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
3918 int32_t src_stride,
3919 uint8_t *dst,
3920 int32_t dst_stride,
3921 int32_t height)
3922 {
3923 uint8_t loop_count;
3924 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3925 v16u8 res;
3926 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3927 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3928 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3929 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3930
3931 for (loop_count = (height >> 2); loop_count--;) {
3932 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3933 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3934 src += (4 * src_stride);
3935 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3936 const20, const6, const3);
3937 ST_UB(res, dst);
3938 dst += dst_stride;
3939
3940 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3941 const20, const6, const3);
3942 ST_UB(res, dst);
3943 dst += dst_stride;
3944
3945 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3946 const20, const6, const3);
3947 ST_UB(res, dst);
3948 dst += dst_stride;
3949
3950 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3951 const20, const6, const3);
3952 ST_UB(res, dst);
3953 dst += dst_stride;
3954 }
3955
3956 LD_UB2(src, 1, inp0, inp1);
3957 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3958 ST_UB(res, dst);
3959 }
3960
hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3961 static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
3962 int32_t src_stride,
3963 uint8_t *dst,
3964 int32_t dst_stride)
3965 {
3966 uint8_t buff[272];
3967
3968 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3969 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3970 }
3971
hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3972 static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
3973 int32_t src_stride,
3974 uint8_t *dst,
3975 int32_t dst_stride)
3976 {
3977 v16u8 inp0, inp1, inp2, inp3;
3978 v16u8 res0, res1, avg0, avg1;
3979 v16u8 horiz0, horiz1, horiz2, horiz3;
3980 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3981 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3982 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3983 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3984 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3985 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3986 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3987 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3988
3989 LD_UB2(src, src_stride, inp0, inp1);
3990 src += (2 * src_stride);
3991 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
3992 mask0, mask1, mask2, mask3,
3993 const20, const6, const3);
3994 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3995 LD_UB2(src, src_stride, inp2, inp3);
3996 src += (2 * src_stride);
3997 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
3998 mask0, mask1, mask2, mask3,
3999 const20, const6, const3);
4000 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4001 LD_UB2(src, src_stride, inp0, inp1);
4002 src += (2 * src_stride);
4003 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4004 mask0, mask1, mask2, mask3,
4005 const20, const6, const3);
4006 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4007 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4008 horiz1, horiz2, horiz3, horiz4,
4009 horiz1, horiz0, horiz0, horiz1,
4010 horiz2, horiz3, horiz4, horiz5,
4011 const20, const6, const3);
4012 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4013 res0 = __msa_aver_u_b(avg0, res0);
4014 ST_D2(res0, 0, 1, dst, dst_stride);
4015 dst += (2 * dst_stride);
4016
4017 LD_UB2(src, src_stride, inp2, inp3);
4018 src += (2 * src_stride);
4019 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4020 mask0, mask1, mask2, mask3,
4021 const20, const6, const3);
4022 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4023 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4024 horiz3, horiz4, horiz5, horiz6,
4025 horiz3, horiz2, horiz1, horiz0,
4026 horiz4, horiz5, horiz6, horiz7,
4027 const20, const6, const3);
4028 inp0 = LD_UB(src);
4029 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4030 mask0, mask1, mask2, mask3,
4031 const20, const6, const3);
4032 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4033 res1 = __msa_aver_u_b(avg1, res1);
4034 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4035 horiz5, horiz6, horiz7, horiz8,
4036 horiz5, horiz4, horiz3, horiz2,
4037 horiz6, horiz7, horiz8, horiz8,
4038 const20, const6, const3);
4039 ST_D2(res1, 0, 1, dst, dst_stride);
4040 dst += 2 * dst_stride;
4041
4042 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4043 res0 = __msa_aver_u_b(avg0, res0);
4044 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4045 horiz7, horiz8, horiz8, horiz7,
4046 horiz7, horiz6, horiz5, horiz4,
4047 horiz8, horiz8, horiz7, horiz6,
4048 const20, const6, const3);
4049 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4050 res1 = __msa_aver_u_b(avg1, res1);
4051 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4052 }
4053
hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)4054 static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
4055 int32_t src_stride,
4056 uint8_t *dst,
4057 int32_t dst_stride,
4058 int32_t height)
4059 {
4060 uint8_t loop_count;
4061 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4062 v16u8 res;
4063 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
4064 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4065 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4066 v8u16 const20 = (v8u16) __msa_ldi_h(20);
4067
4068 for (loop_count = (height >> 2); loop_count--;) {
4069 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4070 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4071 src += (4 * src_stride);
4072 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4073 const20, const6, const3);
4074 res = __msa_aver_u_b(res, inp1);
4075 ST_UB(res, dst);
4076 dst += dst_stride;
4077
4078 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4079 const20, const6, const3);
4080 res = __msa_aver_u_b(res, inp3);
4081 ST_UB(res, dst);
4082 dst += dst_stride;
4083
4084 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4085 const20, const6, const3);
4086 res = __msa_aver_u_b(res, inp5);
4087 ST_UB(res, dst);
4088 dst += dst_stride;
4089
4090 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4091 const20, const6, const3);
4092 res = __msa_aver_u_b(res, inp7);
4093 ST_UB(res, dst);
4094 dst += dst_stride;
4095 }
4096
4097 LD_UB2(src, 1, inp0, inp1);
4098 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4099 res = __msa_aver_u_b(inp1, res);
4100 ST_UB(res, dst);
4101 }
4102
hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4103 static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
4104 int32_t src_stride,
4105 uint8_t *dst,
4106 int32_t dst_stride)
4107 {
4108 uint8_t buff[272];
4109
4110 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4111 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4112 }
4113
hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4114 static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
4115 int32_t src_stride,
4116 uint8_t *dst,
4117 int32_t dst_stride)
4118 {
4119 v16u8 inp0, inp1, inp2, inp3;
4120 v16u8 res0, res1, avg0, avg1;
4121 v16u8 horiz0, horiz1, horiz2, horiz3;
4122 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4123 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4124 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4125 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4126 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4127 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4128 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4129 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4130
4131 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4132 src += (4 * src_stride);
4133 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4134 const20, const6, const3);
4135 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4136 const20, const6, const3);
4137 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4138
4139 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4140 horiz0 = __msa_aver_u_b(inp0, res0);
4141 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4142 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4143
4144 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4145 horiz2 = __msa_aver_u_b(inp2, res1);
4146 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4147 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4148 src += (4 * src_stride);
4149 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4150 const20, const6, const3);
4151 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4152 const20, const6, const3);
4153 SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4154
4155 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4156 horiz4 = __msa_aver_u_b(inp0, res0);
4157 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4158 SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4159
4160 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4161 horiz6 = __msa_aver_u_b(inp2, res1);
4162 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4163 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4164 horiz1, horiz2, horiz3, horiz4,
4165 horiz1, horiz0, horiz0, horiz1,
4166 horiz2, horiz3, horiz4, horiz5,
4167 const20, const6, const3);
4168 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4169 res0 = __msa_aver_u_b(avg0, res0);
4170 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4171 horiz3, horiz4, horiz5, horiz6,
4172 horiz3, horiz2, horiz1, horiz0,
4173 horiz4, horiz5, horiz6, horiz7,
4174 const20, const6, const3);
4175 ST_D2(res0, 0, 1, dst, dst_stride);
4176 dst += 2 * dst_stride;
4177
4178 inp0 = LD_UB(src);
4179 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4180 const20, const6, const3);
4181 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4182 res1 = __msa_aver_u_b(avg1, res1);
4183 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4184 horiz8 = __msa_aver_u_b(inp0, res0);
4185 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4186 horiz5, horiz6, horiz7, horiz8,
4187 horiz5, horiz4, horiz3, horiz2,
4188 horiz6, horiz7, horiz8, horiz8,
4189 const20, const6, const3);
4190 ST_D2(res1, 0, 1, dst, dst_stride);
4191 dst += 2 * dst_stride;
4192
4193 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4194 res0 = __msa_aver_u_b(avg0, res0);
4195 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4196 horiz7, horiz8, horiz8, horiz7,
4197 horiz7, horiz6, horiz5, horiz4,
4198 horiz8, horiz8, horiz7, horiz6,
4199 const20, const6, const3);
4200 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4201 res1 = __msa_aver_u_b(avg1, res1);
4202 ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4203 }
4204
hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4205 static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
4206 int32_t src_stride,
4207 uint8_t *dst,
4208 int32_t dst_stride)
4209 {
4210 uint8_t buff[272];
4211
4212 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4213 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4214 }
4215
/* 8x8 horizontal+vertical qpel MC where the horizontal filter output is
 * averaged with the un-shifted source pixels ("src0") before the vertical
 * pass.  Rows are processed two at a time: each v16u8 intermediate
 * (horiz0..horiz8) holds two 8-byte rows, with splati_d extracting the
 * odd row from the high half.  const20/const6/const3 are the 6-tap filter
 * coefficients (presumably the MPEG-4 qpel 20/-6/3 kernel — the exact
 * rounding lives inside the APPLY_* macros). */
static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    /* Nine horizontally-filtered rows: 8 block rows + 1 extra for the
     * vertical taps (horiz8 is the bottom edge row). */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1: horizontal filter, then average with the raw source. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Pack the two 8-byte source rows into one vector for the average. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);

    /* Rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5, then the first vertical pass (output rows 0-1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Top-edge rows are mirrored (horiz0 reused) in the tap lists. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, vertical pass for output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    /* Extra bottom row (row 8) filtered alone. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Bottom-edge rows are mirrored (horiz8 reused) in the tap lists. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Vertical pass for the last two output rows (6-7). */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4294
/* 16x16 horizontal+vertical qpel MC (no source averaging in either
 * direction): horizontal filter into a 16-byte-stride scratch buffer,
 * then vertical filter from the buffer into dst. */
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4305
/* 8x8 horizontal+vertical qpel MC with no averaging against the source:
 * the horizontal filter output itself becomes the intermediate rows
 * horiz0..horiz8 (two 8-byte rows per vector), which are then vertically
 * filtered two output rows at a time.  Edge rows are mirrored in the
 * vertical tap lists (horiz0 at the top, horiz8 at the bottom). */
static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1 filtered horizontally; odd row taken from the high half. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5, then vertical pass for output rows 0-1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, vertical pass for output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    /* Extra bottom row (row 8) filtered alone. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Vertical passes for output rows 4-7; horiz8 mirrored at the edge. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4377
/* 16x16 horizontal+vertical qpel MC: horizontally filter src (averaged
 * with the one-pixel-right "src1" pixels) into a 16-byte-stride scratch
 * buffer, then vertically filter that buffer into dst. */
static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4388
/* 8x8 horizontal+vertical qpel MC where the horizontal filter output is
 * averaged with the source shifted left by one pixel ("src1", via
 * SLDI_B2_UB(..., 1)) before the vertical pass.  Intermediate rows
 * horiz0..horiz8 each hold two 8-byte rows; edge rows are mirrored in
 * the vertical tap lists. */
static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);

    /* Rows 0-3 filtered horizontally. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Shift source rows by 1 pixel so the average uses the right
     * neighbour (the "src1" half-pel position). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-7. */
    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Extra bottom row (row 8) filtered alone, also shifted+averaged. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Vertical passes: output rows 0-3 (top edge mirrors horiz0). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Vertical passes: output rows 4-7 (bottom edge mirrors horiz8). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4469
/* 16x16 horizontal+vertical qpel MC for the (src0 horizontal, src1
 * vertical) phase: horizontal filter averaged with the un-shifted
 * pixels into a scratch buffer, then a vertical filter that averages
 * with the row below. */
static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4480
/* 8x8 horizontal+vertical qpel MC for the (src0 horizontal, src1
 * vertical) phase: the horizontal output is averaged with the un-shifted
 * source rows, and the vertical output is additionally averaged with the
 * intermediate rows one position below (horiz1/horiz2, horiz3/horiz4, ...
 * packed via insve_d). */
static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);

    /* Rows 0-3: horizontal filter + average with the raw source. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);

    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1, then average with the rows
     * one below (horiz1 and horiz2) for the src1 vertical phase. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, vertical pass for output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Extra bottom row (row 8). */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
    res1 = __msa_aver_u_b(avg1, res1);
    /* Vertical pass for output rows 4-5 (bottom edge mirrors horiz8). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
    res0 = __msa_aver_u_b(avg0, res0);
    /* Vertical pass for output rows 6-7. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4566
/* 16x16 horizontal+vertical qpel MC: plain horizontal filter into a
 * scratch buffer, then a vertical filter whose output is averaged with
 * the row below ("src1" vertical phase). */
static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4577
/* 8x8 horizontal+vertical qpel MC for the (plain horizontal, src1
 * vertical) phase: horizontal filter output is used directly as the
 * intermediate rows horiz0..horiz8, and the vertical filter output is
 * averaged with the intermediate rows one position below (packed with
 * insve_d).  Fix: the original duplicated the "horiz5 = splati(horiz4)"
 * statement; the redundant second copy is removed (no behavior change). */
static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1 filtered horizontally; odd row taken from the high half. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1, averaged with the rows one
     * below (horiz1, horiz2) for the src1 vertical phase. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, vertical pass for output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    /* Extra bottom row (row 8) filtered alone. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
    res1 = __msa_aver_u_b(avg1, res1);
    /* Vertical pass for output rows 4-5 (bottom edge mirrors horiz8). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;
    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
    res0 = __msa_aver_u_b(avg0, res0);

    /* Vertical pass for output rows 6-7. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4660
/* 16x16 horizontal+vertical qpel MC for the (src1 horizontal, src1
 * vertical) phase: horizontal filter averaged with the right-neighbour
 * pixels into a scratch buffer, then a vertical filter averaged with
 * the row below. */
static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4671
/* 8x8 horizontal+vertical qpel MC for the (src1 horizontal, src1
 * vertical) phase: the horizontal output is averaged with the source
 * shifted one pixel left (SLDI by 1), and the vertical output is
 * averaged with the intermediate rows one position below (packed with
 * ilvr_d). */
static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    /* Rows 0-1: horizontal filter, then average with the 1-pixel-shifted
     * source (the "src1" horizontal position). */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                         mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1, averaged with the rows one
     * below (horiz1, horiz2) for the src1 vertical phase. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_aver_u_b(avg0, res0);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Rows 6-7, vertical pass for output rows 2-3. */
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_aver_u_b(avg1, res1);
    /* Extra bottom row (row 8), shifted and averaged like the others. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Vertical pass for output rows 4-5 (bottom edge mirrors horiz8). */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_aver_u_b(avg0, res0);
    /* Vertical pass for output rows 6-7. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4764
/* 16x16 averaging-put (avg_dst) variant of the (src0, src0) HV qpel MC:
 * horizontal filter averaged with src0 into a scratch buffer, then a
 * vertical filter whose result is also averaged into the existing dst
 * pixels. */
static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4775
/* 8x8 averaging-put (avg_dst) variant of the (src0, src0) HV qpel MC:
 * horizontal output averaged with the un-shifted source, vertical output
 * averaged with the intermediate row above it (src0 vertical phase), and
 * the final result averaged once more with the pixels already in dst. */
static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Current destination rows, reloaded two at a time for the final
     * avg-with-dst step. */
    v16u8 dst0, dst1;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1: horizontal filter + average with the raw source. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0-1: vertical filter, average with horiz0/horiz1
     * (src0 vertical), then average with the existing dst pixels. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, then output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Extra bottom row (row 8), then output rows 4-5. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7 (bottom edge mirrors horiz8). */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
4877
/* 16x16 averaging-put (avg_dst) variant with plain horizontal filtering
 * and src0 vertical averaging: horizontal filter into a scratch buffer,
 * then the avg-dst src0 vertical filter into dst. */
static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* 272 = 16 * 17: one extra intermediate row for the vertical taps. */
    uint8_t tmp_buf[272];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4888
/* 8x8 averaging-put (avg_dst) variant with plain horizontal filtering
 * (no source averaging) and src0 vertical averaging: the vertical output
 * is averaged with the intermediate row above it, then averaged again
 * with the pixels already in dst. */
static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Current destination rows for the final avg-with-dst step. */
    v16u8 dst0, dst1;
    /* Shuffle masks pairing source pixels for the horizontal taps. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* 6-tap filter coefficients. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1 filtered horizontally (two 8-byte rows per vector). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0-1: vertical filter, average with horiz0/horiz1
     * (src0 vertical), then average with the existing dst pixels. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7, then output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Extra bottom row (row 8), then output rows 4-5. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7 (bottom edge mirrors horiz8). */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
4986
static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* Intermediate buffer for the horizontally filtered rows
       (16-byte stride; 272 = 16 * 17 rows). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
4997
/* 8x8 horizontal+vertical qpel motion compensation; the final result is
 * additionally averaged with the existing contents of dst.
 * The horizontal filter output is averaged with the source shifted left by
 * one byte (SLDI by 1), and the vertical filter output is averaged with the
 * unshifted filtered rows.
 * NOTE(review): the exact qpel phase implied by "hv_src10" in the name
 * should be confirmed against the generic qpeldsp implementation.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* horiz0..horiz8: nine horizontally filtered rows; each vector packs
       two 8-byte rows (row pairs merged with ilvr_d, odd row extracted
       with splati_d below). */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass over rows 0..5, averaging each filtered row pair
       with the 1-byte-shifted source rows. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1: filter, average with the packed
       horiz0/horiz1 rows, then average with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row: single-row horizontal filter, then output rows 4-5. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7: bottom edge clamps by reusing horiz7/horiz8. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5111
static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5122
/* 8x8 horizontal+vertical qpel motion compensation; the final result is
 * averaged with the existing contents of dst.
 * "aver_h_src0": the horizontal filter output is averaged with the
 * unshifted source rows; the vertical filter output is not averaged with
 * the intermediate rows.
 * NOTE(review): exact qpel phase implied by the name should be confirmed
 * against the generic qpeldsp implementation.
 */
static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* Nine horizontally filtered rows; each vector packs two 8-byte rows. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass over rows 0..5; each filtered row pair is averaged
       with the packed source rows. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1; averaged with dst only. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row: single-row horizontal filter, then output rows 4-5. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7: bottom edge clamps by reusing horiz7/horiz8. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5216
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5226
/* 8x8 horizontal+vertical qpel motion compensation ("avg" variant: the
 * filtered result is averaged with the existing contents of dst).
 * Unlike the aver_h/aver_v variants, neither filter stage is averaged with
 * the source or intermediate rows here.
 */
static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* Nine horizontally filtered rows; each vector packs two 8-byte rows,
       the odd row being extracted with splati_d. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass over all nine source rows. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    /* Vertical pass, two output rows per step, each averaged with dst.
       Top/bottom edges are handled by mirroring rows in the argument lists. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5314
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5325
/* 8x8 horizontal+vertical qpel motion compensation; result averaged with
 * the existing contents of dst.
 * "aver_h_src1": the horizontal filter output is averaged with the source
 * rows shifted left by one byte (SLDI by 1).
 * NOTE(review): exact qpel phase implied by the name should be confirmed
 * against the generic qpeldsp implementation.
 */
static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* Nine horizontally filtered rows; each vector packs two 8-byte rows. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass over rows 0..5, averaging each filtered row pair
       with the 1-byte-shifted source rows. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass for output rows 0-1; averaged with dst only. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row: single-row horizontal filter, then output rows 4-5. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7: bottom edge clamps by reusing horiz7/horiz8. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5430
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5441
/* 8x8 horizontal+vertical qpel motion compensation; result averaged with
 * the existing contents of dst.
 * "aver_hv_src01": the horizontal filter output is averaged with the
 * unshifted source rows, and the vertical filter output is averaged with
 * the next filtered row pair (horiz N+1 / N+2).
 * NOTE(review): exact qpel phase implied by the name should be confirmed
 * against the generic qpeldsp implementation.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* Nine horizontally filtered rows; each vector packs two 8-byte rows. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);

    /* Horizontal pass over rows 0..5, averaging each filtered row pair
       with the packed source rows. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0-1: vertical filter, averaged with horiz1/horiz2, then
       with dst. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row, then output rows 4-7 (bottom edge clamps by
       reusing horiz7/horiz8). */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_aver_u_b(avg0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_aver_u_b(avg1, res1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5544
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5555
/* 8x8 horizontal+vertical qpel motion compensation; result averaged with
 * the existing contents of dst.
 * "aver_v_src1": the vertical filter output is averaged with the next
 * filtered row pair (horiz N+1 / N+2); the horizontal output is used as-is.
 * NOTE(review): exact qpel phase implied by the name should be confirmed
 * against the generic qpeldsp implementation.
 */
static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* Nine horizontally filtered rows; each vector packs two 8-byte rows,
       the odd row being extracted with splati_d. */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle masks consumed by the APPLY_HORIZ_QPEL_* macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients used by the APPLY_*_QPEL_FILTER macros. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass over rows 0..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0-1: vertical filter, averaged with horiz1/horiz2 and
       then with dst. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2-3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row, then output rows 4-7 (bottom edge clamps by
       reusing horiz7/horiz8). */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
                                        horiz6, horiz7, horiz8, horiz5, horiz4,
                                        horiz3, horiz2, horiz6, horiz7, horiz8,
                                        horiz8, const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
                                        horiz8, horiz8, horiz7, horiz7, horiz6,
                                        horiz5, horiz4, horiz8, horiz8, horiz7,
                                        horiz6, const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_aver_u_b(avg0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_aver_u_b(avg1, res1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5651
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* Scratch for the horizontal pass (16-byte stride; 272 = 16 * 17). */
    uint8_t tmp_buf[272];

    /* Horizontal pass into the scratch buffer, then vertical pass that
       averages the result with the current contents of dst. */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp_buf, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(tmp_buf, 16, dst, dst_stride);
}
5662
/* 8x8 quarter-pel MC, "hv_src11" case with destination averaging:
 * each horizontal filter result is averaged with the input shifted one
 * pixel right (the sldi_b-by-1 below), the vertical filter result is
 * averaged with the next filtered row pair, and the final output is
 * averaged with the existing dst contents (avg_dst). */
static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    /* horiz0..horiz8: nine horizontally-filtered rows feeding the
     * vertical filter (two 8-byte rows packed per vector via ilvr_d). */
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Shuffle masks selecting the tap neighborhoods for the 8-byte
     * horizontal qpel filter. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter coefficients 20/6/3 broadcast to all byte lanes. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Rows 0-1: horizontal filter, then average with the input shifted
     * one pixel right (src1 rounding position). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    /* horiz1 = upper half of horiz0 (row 1 alone). */
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Rows 2-3. */
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Rows 4-5. */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical filter for output rows 0-1; edge rows are mirrored
     * (horiz0 reused for the taps above row 0). Average first with the
     * next filtered rows (src1 vertical position), then with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
                                        horiz2, horiz3, horiz4, horiz1, horiz0,
                                        horiz0, horiz1, horiz2, horiz3, horiz4,
                                        horiz5, const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Rows 6-7 of the horizontal pass, output rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
                                        horiz4, horiz5, horiz6, horiz3, horiz2,
                                        horiz1, horiz0, horiz4, horiz5, horiz6,
                                        horiz7, const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Last source row (row 8) via the single-row filter variant,
     * output rows 4-5. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
                                        horiz6, horiz7, horiz8, horiz5, horiz4,
                                        horiz3, horiz2, horiz6, horiz7, horiz8,
                                        horiz8, const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6-7; bottom edge rows are mirrored (horiz8/horiz7
     * reused for taps below the block). */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
                                        horiz8, horiz8, horiz7, horiz7, horiz6,
                                        horiz5, horiz4, horiz8, horiz8, horiz7,
                                        horiz6, const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5769
/* Plain 8x8 byte-block copy: eight rows of 8 bytes, moved via 64-bit
 * scalar loads/stores, two rows per iteration. */
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride)
{
    uint64_t row0, row1;
    int32_t row;

    for (row = 0; row < 4; row++) {
        row0 = LD(src);
        row1 = LD(src + src_stride);
        src += (2 * src_stride);

        SD(row0, dst);
        SD(row1, dst + dst_stride);
        dst += (2 * dst_stride);
    }
}
5788
/* Plain 16x16 byte-block copy: the whole block is read into sixteen
 * vector registers before any store is issued. */
static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

    LD_UB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src + 8 * src_stride, src_stride,
           row8, row9, row10, row11, row12, row13, row14, row15);

    ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7, dst, dst_stride);
    ST_UB8(row8, row9, row10, row11, row12, row13, row14, row15,
           dst + 8 * dst_stride, dst_stride);
}
5805
/* Average an 8-pixel-wide block with dst in place: dst = avg(src, dst),
 * four rows per iteration; height is assumed to be a multiple of 4. */
static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t row;
    uint64_t res0, res1, res2, res3;
    v16u8 in0, in1, in2, in3;
    v16u8 ref0, ref1, ref2, ref3;

    for (row = 0; row < (height / 4); row++) {
        LD_UB4(src, src_stride, in0, in1, in2, in3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);

        AVER_UB4_UB(in0, ref0, in1, ref1, in2, ref2, in3, ref3,
                    ref0, ref1, ref2, ref3);

        /* Only the low 8 bytes of each averaged vector are written back. */
        res0 = __msa_copy_u_d((v2i64) ref0, 0);
        res1 = __msa_copy_u_d((v2i64) ref1, 0);
        res2 = __msa_copy_u_d((v2i64) ref2, 0);
        res3 = __msa_copy_u_d((v2i64) ref3, 0);
        SD4(res0, res1, res2, res3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
5831
/* Average a 16-pixel-wide block with dst in place: dst = avg(src, dst),
 * eight rows per iteration; height is assumed to be a multiple of 8. */
static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t row;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;

    for (row = 0; row < (height / 8); row++) {
        LD_UB8(src, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);

        AVER_UB4_UB(in0, ref0, in1, ref1, in2, ref2, in3, ref3,
                    ref0, ref1, ref2, ref3);
        AVER_UB4_UB(in4, ref4, in5, ref5, in6, ref6, in7, ref7,
                    ref4, ref5, ref6, ref7);
        ST_UB8(ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
5853
/* Public copy wrappers. Note the qpeldsp calling convention is
 * (dest, src, stride); the internal helpers take (src, src_stride,
 * dst, dst_stride), so the arguments are swapped here. */
void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_16x16_msa(src, stride, dest, stride);
}

void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_8x8_msa(src, stride, dest, stride);
}
5863
/* Horizontal qpel wrappers (rounded and no-round variants). Each one
 * swaps (dest, src, stride) to the internal (src, stride, dst, stride)
 * order and supplies the fixed block width (8 or 16). */
void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
                                 ptrdiff_t stride)
{
    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
                                        const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5943
/* Pixel-averaging wrappers (dst = avg(src, dst)); the trailing argument
 * is the block height, equal to the width for these square blocks. */
void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width8_msa(src, stride, dest, stride, 8);
}

void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width16_msa(src, stride, dest, stride, 16);
}
5953
/* Horizontal qpel wrappers, avg_dst variants (result averaged with the
 * existing destination). Same argument swap and fixed width as above. */
void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
                                          const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
}

void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
}

void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5993
5994
/* Vertical qpel wrappers (rounded, no-round and avg_dst variants).
 * The block size is encoded in the helper name, so no extra size
 * argument is passed — only the (dest, src) -> (src, dst) swap. */
void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                               ptrdiff_t stride)
{
    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                      const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
}

void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
}
6110
6111 /* HV cases */
/* 2-D (horizontal + vertical) qpel wrappers, rounded variants. The
 * srcXY / h_srcN / v_srcN suffixes name the sub-pel position handled
 * by the corresponding internal helper. */
void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                           ptrdiff_t stride)
{
    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6223
/* 2-D qpel wrappers, avg_dst variants (result averaged with the
 * existing destination contents). */
void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                   const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6347
/* 2-D qpel wrappers, no-round variants. */
void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}

void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6471