/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hpeldsp_mips.h"

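/* Store helpers: pack the even-indexed bytes of two vectors into one
 * result, optionally average it with the data already at the
 * destination, then store it. */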
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
{                                                             \
    v16u8 tmp_m;                                              \
                                                              \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
}

#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}

#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}

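/* Horizontal (x2) half-pel interpolation with rounding: each output
 * pixel is the rounded average (a + b + 1) >> 1 of a source pixel and
 * its right neighbour, for block widths of 4, 8 and 16. */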
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                       dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

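/* No-rounding horizontal variants: same as above, but averaged
 * without the rounding bias, i.e. (a + b) >> 1. */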
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
    v16i8 zeros = { 0 };

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

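/* Horizontal interpolate-and-average: the rounded horizontal average
 * is additionally averaged with the pixels already present at dst. */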
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);

        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);

        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                           dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                           dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

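/* Vertical (y2) half-pel interpolation with rounding: rounded average
 * of each pair of vertically adjacent source rows. */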
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src2, res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                      dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                       dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src8;
    }
}

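/* No-rounding vertical variants, (a + b) >> 1. */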
static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
    dst += (4 * dst_stride);

    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4;

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    src16 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src12, src13, src13, src14,
                  src14, src15, src15, src16, dst, dst_stride);
}

static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
}

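/* Vertical interpolate-and-average against the destination. */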
static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1, dst0, dst1;
    v16u8 src0, src1, src2;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16u8 res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);
        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
        src0 = src2;
    }
}

static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                          dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);

        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);

        src0 = src8;
    }
}

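/* 2-D (xy2) half-pel interpolation with rounding: bilinear average of
 * each pixel with its right, bottom and bottom-right neighbours,
 * (a + b + c + d + 2) >> 2, computed via horizontal adds of the
 * interleaved columns. */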
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t res0, res1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
                   src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
        SW(res0, dst);
        dst += dst_stride;
        SW(res1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

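/* No-rounding 2-D variants: (a + b + c + d + 1) >> 2, using an
 * explicit +1 bias and a plain shift instead of a rounding shift. */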
static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
               src5_sld1, src6_sld1);
    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
               src5_r, src6_r);
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    src4 = LD_SB(src);

    SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
               src1_sld1, src2_sld1);
    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
               src1_r, src2_r);
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB2_UH(src3_r, src4_r, add3, add4);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

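/* 2-D interpolate-and-average: the rounded xy2 result is averaged
 * with the pixels already present at dst. */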
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
             sum2_r, sum3_r);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
             sum6_r, sum7_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
             sum2_l, sum3_l);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
             sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
        dst += dst_stride;
    }
}

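/* Plain block copies for widths 8 and 16, with unrolled paths for
 * heights that are multiples of 12, 8, 4 and 2. */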
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}

static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
{
    int32_t cnt, loop_cnt;
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);

            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
        }

        src += 16;
        dst += 16;
    }
}

static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

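/* Average the source block into the destination block with no
 * sub-pel offset: dst = (src + dst + 1) >> 1. */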
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}

static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

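/* Exported hpeldsp entry points: thin wrappers that dispatch to the
 * width-specific helpers above. */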
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}