/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "me_cmp_mips.h"

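/* Sum of absolute differences for an 8-pixel-wide block: two 8-byte rows are
 * packed into one 16-byte vector, so each loop iteration covers four rows. */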
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
                               uint8_t *ref, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

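/* Sum of absolute differences for a 16-pixel-wide block, two rows per step. */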
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *ref, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);

        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

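/* SAD against the horizontally half-pel interpolated reference: each reference
 * pixel is the rounded average of ref[x] and ref[x + 1] (8-pixel-wide block). */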
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

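/* Horizontal half-pel SAD for a 16-pixel-wide block: the shifted reference
 * row is loaded from ref + 1 and averaged with the aligned row. */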
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

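/* Vertical half-pel SAD for an 8-pixel-wide block: each reference row is the
 * rounded average of two vertically adjacent rows, so five rows are loaded
 * for every four output rows. */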
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *ref,
                                                    int32_t ref_stride,
                                                    int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

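/* Vertical half-pel SAD for a 16-pixel-wide block; the last loaded row is
 * carried over in ref4 between the two halves of the unrolled loop. */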
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (5 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        ref4 = ref3;

        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (3 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

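/* Half-pel SAD in both directions for an 8-pixel-wide block: each reference
 * pixel is the rounded average of a 2x2 neighbourhood, built from horizontal
 * pair sums of adjacent rows followed by a rounding shift by 2. */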
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *ref,
                                                  int32_t ref_stride,
                                                  int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);

        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp0 += comp1;
        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
        comp2 = __msa_hadd_u_h(temp0, temp0);
        comp1 += comp2;
        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
        sad += __msa_hadd_u_h(diff, diff);

        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
        comp3 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp3;
        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp3 += comp0;
        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

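/* Half-pel SAD in both directions for a 16-pixel-wide block; the horizontal
 * pair sums of each reference row are reused as the vertical neighbours of
 * the following row. */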
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *ref,
                                                   int32_t ref_stride,
                                                   int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp, diff;
    v16u8 temp0, temp1, temp2, temp3;
    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
        ref += (5 * ref_stride);

        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
        ref += (3 * ref_stride);

        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

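/* Accumulate the sum of squared byte differences between src and ref into the
 * v4i32 accumulator var. */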
#define CALC_MSE_B(src, ref, var)                                   \
{                                                                   \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
}

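/* Sum of squared errors for a 4-pixel-wide block: four 32-bit rows are
 * inserted into one vector per iteration. */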
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LW4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        INSERT_W4_UB(src0, src1, src2, src3, src);
        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

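/* Sum of squared errors for an 8-pixel-wide block, four rows per iteration. */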
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        CALC_MSE_B(src0, ref0, var);
        CALC_MSE_B(src1, ref1, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

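/* Sum of squared errors for a 16-pixel-wide block, one full row per load. */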
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

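/* 8x8 Hadamard transform of the src - ref difference block: butterflies and
 * transposes are applied along rows and columns, then the absolute values of
 * all transform coefficients are summed (SATD). */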
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *ref, int32_t ref_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v8i16 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, zero);
    sum += __msa_add_a_h((v8i16) diff1, zero);
    sum += __msa_add_a_h((v8i16) diff2, zero);
    sum += __msa_add_a_h((v8i16) diff3, zero);

    return (HADD_UH_U32(sum));
}

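/* Same 8x8 Hadamard sum as above, but applied to the source block alone, with
 * the DC term subtracted at the end (intra variant; ref is not read). */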
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
{
    int32_t sum_res = 0;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v16i8 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                       src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
               zero, src4, zero, src5, zero, src6, zero, src7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
    sum_res = (HADD_UH_U32(sum));
    sum_res -= abs(temp0[0] + temp4[0]);

    return sum_res;
}

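/* Exported wrappers matching the me_cmp_func signature; the MpegEncContext
 * argument is unused and src and ref share the same stride. */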
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    return sad_16width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    return sad_8width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    return sse_16width_msa(src, stride, ref, stride, height);
}

int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_8width_msa(src, stride, ref, stride, height);
}

int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_4width_msa(src, stride, ref, stride, height);
}

int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    return hadamard_diff_8x8_msa(src, stride, dst, stride);
}

int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    return hadamard_intra_8x8_msa(src, stride, dst, stride);
}

/* Hadamard Transform functions: build the 16-pixel-wide variants from the 8x8
 * versions by scoring the left and right halves and, when h == 16, the lower
 * pair of halves as well. */
#define WRAPPER8_16_SQ(name8, name16)                         \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,     \
           ptrdiff_t stride, int h)                           \
{                                                             \
    int score = 0;                                            \
    score += name8(s, dst, src, stride, 8);                   \
    score += name8(s, dst + 8, src + 8, stride, 8);           \
    if (h == 16) {                                            \
        dst += 8 * stride;                                    \
        src += 8 * stride;                                    \
        score += name8(s, dst, src, stride, 8);               \
        score += name8(s, dst + 8, src + 8, stride, 8);       \
    }                                                         \
    return score;                                             \
}

WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);