/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

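/* Insert word 0 of each of the four input vectors into word lanes 0..3 of
 * 'out', packing one 4-pixel reference row per lane. */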
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {     \
  out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);   \
  out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);   \
  out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);   \
  out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);   \
}
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

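/* SAD of a 4-pixel-wide block: four rows are packed into one vector per
 * iteration before the absolute differences are accumulated. */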
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

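/* The _x3 variants return SADs for the reference block at horizontal pixel
 * offsets 0, 1 and 2. */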
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}

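/* The _x8 variants return SADs for eight consecutive horizontal pixel
 * offsets (0..7) of the reference block. */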
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

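/* 64-wide x8: offsets 0..3 are accumulated in the first pass; a second pass
 * over the same rows (via src_dup/ref_dup) handles offsets 4..7. */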
static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *src_dup, *ref_dup;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  src_dup = src;
  ref_dup = ref;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_SW_S32(sad);

  sad0_0 = (v8u16)__msa_ldi_h(0);
  sad0_1 = (v8u16)__msa_ldi_h(0);
  sad1_0 = (v8u16)__msa_ldi_h(0);
  sad1_1 = (v8u16)__msa_ldi_h(0);
  sad2_0 = (v8u16)__msa_ldi_h(0);
  sad2_1 = (v8u16)__msa_ldi_h(0);
  sad3_0 = (v8u16)__msa_ldi_h(0);
  sad3_1 = (v8u16)__msa_ldi_h(0);

  for (ht_cnt = 64; ht_cnt--;) {
    LD_UB4(src_dup, 16, src0, src1, src2, src3);
    src_dup += src_stride;
    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref_dup += ref_stride;

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[4] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[5] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[6] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[7] = HADD_SW_S32(sad);
}

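/* The _x4d variants compute SADs against four independent reference blocks
 * supplied through aref_ptr[]. */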
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t * const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t * const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad_array[0] = HADD_UH_U32(sad0_0);
  sad_array[0] += HADD_UH_U32(sad0_1);
  sad_array[1] = HADD_UH_U32(sad1_0);
  sad_array[1] += HADD_UH_U32(sad1_1);
  sad_array[2] = HADD_UH_U32(sad2_0);
  sad_array[2] += HADD_UH_U32(sad2_1);
  sad_array[3] = HADD_UH_U32(sad3_0);
  sad_array[3] += HADD_UH_U32(sad3_1);
}

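/* The avgsad variants average the reference with a second predictor
 * (compound prediction) before computing the SAD. */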
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

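/* The macros below instantiate the vpx_sad* entry points for each supported
 * block size. */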
#define VPX_SAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride) { \
  return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride) { \
  return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_4xHEIGHTx3_MSA(height) \
void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx3_MSA(height) \
void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx3_MSA(height) \
void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx3_MSA(height) \
void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx3_MSA(height) \
void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_4xHEIGHTx8_MSA(height) \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx8_MSA(height) \
void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx8_MSA(height) \
void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx8_MSA(height) \
void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx8_MSA(height) \
void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *const refs[], \
                                int32_t ref_stride, uint32_t *sads) { \
  sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *const refs[], \
                                int32_t ref_stride, uint32_t *sads) { \
  sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_AVGSAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                     const uint8_t *ref, int32_t ref_stride, \
                                     const uint8_t *second_pred) { \
  return avgsad_4width_msa(src, src_stride, ref, ref_stride, \
                           height, second_pred); \
}

#define VPX_AVGSAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                     const uint8_t *ref, int32_t ref_stride, \
                                     const uint8_t *second_pred) { \
  return avgsad_8width_msa(src, src_stride, ref, ref_stride, \
                           height, second_pred); \
}

#define VPX_AVGSAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_16width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

#define VPX_AVGSAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_32width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

#define VPX_AVGSAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_64width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);