1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/macros_msa.h"
13 
14 #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {    \
15   out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);  \
16   out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);  \
17   out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);  \
18   out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);  \
19 }
20 #define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
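/* SAD_INSVE_W4 copies word 0 (the low 32 bits) of each of four byte vectors
 * into lanes 0..3 of 'out', i.e. it packs the first four bytes of four rows
 * into a single 16-byte vector.  Rough scalar picture of the same packing
 * (illustrative only, not part of this file):
 *
 *   out.word[0] = in0.word[0];
 *   out.word[1] = in1.word[0];
 *   out.word[2] = in2.word[0];
 *   out.word[3] = in3.word[0];
 */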
21 
22 static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
23                                const uint8_t *ref_ptr, int32_t ref_stride,
24                                int32_t height) {
25   int32_t ht_cnt;
26   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
27   v16u8 src = { 0 };
28   v16u8 ref = { 0 };
29   v16u8 diff;
30   v8u16 sad = { 0 };
31 
32   for (ht_cnt = (height >> 2); ht_cnt--;) {
33     LW4(src_ptr, src_stride, src0, src1, src2, src3);
34     src_ptr += (4 * src_stride);
35     LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
36     ref_ptr += (4 * ref_stride);
37 
38     INSERT_W4_UB(src0, src1, src2, src3, src);
39     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
40 
41     diff = __msa_asub_u_b(src, ref);
42     sad += __msa_hadd_u_h(diff, diff);
43   }
44 
45   return HADD_UH_U32(sad);
46 }
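
/* For reference, a plain-C sketch of what the 4-wide SAD above computes
 * (illustrative only, not part of the vpx build; the MSA version merely
 * processes four rows per loop iteration, so height is assumed to be a
 * multiple of 4):
 */
#if 0
static uint32_t sad_4width_ref(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t row, col;
  uint32_t sad = 0;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < 4; ++col) {
      const int32_t diff = src[col] - ref[col];
      sad += (diff < 0) ? -diff : diff;
    }
    src += src_stride;
    ref += ref_stride;
  }

  return sad;
}
#endif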
47 
48 static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
49                                const uint8_t *ref, int32_t ref_stride,
50                                int32_t height) {
51   int32_t ht_cnt;
52   v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
53   v8u16 sad = { 0 };
54 
55   for (ht_cnt = (height >> 2); ht_cnt--;) {
56     LD_UB4(src, src_stride, src0, src1, src2, src3);
57     src += (4 * src_stride);
58     LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
59     ref += (4 * ref_stride);
60 
61     PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
62                 src0, src1, ref0, ref1);
63     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
64   }
65 
66   return HADD_UH_U32(sad);
67 }
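
/* The 8-wide path loads four rows (8 useful bytes each) and uses PCKEV_D4_UB
 * to pack the low doublewords pairwise, so rows 0/1 share one 16-byte vector
 * and rows 2/3 share another; SAD_UB2_UH then covers two rows per call.
 * Layout sketch after packing (illustrative only):
 *
 *   src0 = { row0[0..7], row1[0..7] }    src1 = { row2[0..7], row3[0..7] }
 *   ref0 = { ref_row0[0..7], ref_row1[0..7] }, and likewise ref1.
 */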
68 
69 static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
70                                 const uint8_t *ref, int32_t ref_stride,
71                                 int32_t height) {
72   int32_t ht_cnt;
73   v16u8 src0, src1, ref0, ref1;
74   v8u16 sad = { 0 };
75 
76   for (ht_cnt = (height >> 2); ht_cnt--;) {
77     LD_UB2(src, src_stride, src0, src1);
78     src += (2 * src_stride);
79     LD_UB2(ref, ref_stride, ref0, ref1);
80     ref += (2 * ref_stride);
81     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
82 
83     LD_UB2(src, src_stride, src0, src1);
84     src += (2 * src_stride);
85     LD_UB2(ref, ref_stride, ref0, ref1);
86     ref += (2 * ref_stride);
87     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
88   }
89 
90   return HADD_UH_U32(sad);
91 }
92 
93 static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
94                                 const uint8_t *ref, int32_t ref_stride,
95                                 int32_t height) {
96   int32_t ht_cnt;
97   v16u8 src0, src1, ref0, ref1;
98   v8u16 sad = { 0 };
99 
100   for (ht_cnt = (height >> 2); ht_cnt--;) {
101     LD_UB2(src, 16, src0, src1);
102     src += src_stride;
103     LD_UB2(ref, 16, ref0, ref1);
104     ref += ref_stride;
105     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
106 
107     LD_UB2(src, 16, src0, src1);
108     src += src_stride;
109     LD_UB2(ref, 16, ref0, ref1);
110     ref += ref_stride;
111     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
112 
113     LD_UB2(src, 16, src0, src1);
114     src += src_stride;
115     LD_UB2(ref, 16, ref0, ref1);
116     ref += ref_stride;
117     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
118 
119     LD_UB2(src, 16, src0, src1);
120     src += src_stride;
121     LD_UB2(ref, 16, ref0, ref1);
122     ref += ref_stride;
123     sad += SAD_UB2_UH(src0, src1, ref0, ref1);
124   }
125 
126   return HADD_UH_U32(sad);
127 }
128 
129 static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
130                                 const uint8_t *ref, int32_t ref_stride,
131                                 int32_t height) {
132   int32_t ht_cnt;
133   uint32_t sad = 0;
134   v16u8 src0, src1, src2, src3;
135   v16u8 ref0, ref1, ref2, ref3;
136   v8u16 sad0 = { 0 };
137   v8u16 sad1 = { 0 };
138 
139   for (ht_cnt = (height >> 1); ht_cnt--;) {
140     LD_UB4(src, 16, src0, src1, src2, src3);
141     src += src_stride;
142     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
143     ref += ref_stride;
144     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
145     sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
146 
147     LD_UB4(src, 16, src0, src1, src2, src3);
148     src += src_stride;
149     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
150     ref += ref_stride;
151     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
152     sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
153   }
154 
155   sad = HADD_UH_U32(sad0);
156   sad += HADD_UH_U32(sad1);
157 
158   return sad;
159 }
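
/* The 64-wide path keeps two v8u16 accumulators on purpose: assuming
 * SAD_UB2_UH is the usual absolute-difference plus pairwise horizontal-add
 * macro from macros_msa.h, each call adds at most 4 * 255 = 1020 per 16-bit
 * lane.  One accumulator covering two of the four 16-byte chunks over 64
 * rows therefore stays at 64 * 1020 = 65280, just under the 65535 lane
 * limit, whereas a single accumulator for all four chunks could overflow.
 */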
160 
161 static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
162                               const uint8_t *ref_ptr, int32_t ref_stride,
163                               int32_t height, uint32_t *sad_array) {
164   int32_t ht_cnt;
165   uint32_t src0, src1, src2, src3;
166   v16u8 src = { 0 };
167   v16u8 ref = { 0 };
168   v16u8 ref0, ref1, ref2, ref3, diff;
169   v8u16 sad0 = { 0 };
170   v8u16 sad1 = { 0 };
171   v8u16 sad2 = { 0 };
172 
173   for (ht_cnt = (height >> 2); ht_cnt--;) {
174     LW4(src_ptr, src_stride, src0, src1, src2, src3);
175     src_ptr += (4 * src_stride);
176     INSERT_W4_UB(src0, src1, src2, src3, src);
177 
178     LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
179     ref_ptr += (4 * ref_stride);
180     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
181     diff = __msa_asub_u_b(src, ref);
182     sad0 += __msa_hadd_u_h(diff, diff);
183 
184     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
185     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
186     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
187     diff = __msa_asub_u_b(src, ref);
188     sad1 += __msa_hadd_u_h(diff, diff);
189 
190     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
191     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
192     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
193     diff = __msa_asub_u_b(src, ref);
194     sad2 += __msa_hadd_u_h(diff, diff);
195   }
196 
197   sad_array[0] = HADD_UH_U32(sad0);
198   sad_array[1] = HADD_UH_U32(sad1);
199   sad_array[2] = HADD_UH_U32(sad2);
200 }
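
/* The x3 variants score three candidate positions per call: the reference
 * block itself and the same block shifted right by one and by two pixels,
 * produced with SLDI byte shifts instead of extra loads.  A plain-C sketch
 * of the semantics (illustrative only, not part of the vpx build):
 */
#if 0
static void sad_4width_x3_ref(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sads) {
  int32_t off, row, col;

  for (off = 0; off < 3; ++off) {
    const uint8_t *s = src;
    const uint8_t *r = ref + off;  /* horizontal offset 0, 1 or 2 */
    uint32_t sad = 0;

    for (row = 0; row < height; ++row) {
      for (col = 0; col < 4; ++col) {
        const int32_t diff = s[col] - r[col];
        sad += (diff < 0) ? -diff : diff;
      }
      s += src_stride;
      r += ref_stride;
    }
    sads[off] = sad;
  }
}
#endif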
201 
202 static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
203                               const uint8_t *ref, int32_t ref_stride,
204                               int32_t height, uint32_t *sad_array) {
205   int32_t ht_cnt;
206   v16u8 src0, src1, src2, src3;
207   v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
208   v8u16 sad0 = { 0 };
209   v8u16 sad1 = { 0 };
210   v8u16 sad2 = { 0 };
211 
212   for (ht_cnt = (height >> 2); ht_cnt--;) {
213     LD_UB4(src, src_stride, src0, src1, src2, src3);
214     src += (4 * src_stride);
215     LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
216     ref += (4 * ref_stride);
217     PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
218                 src0, src1, ref0, ref1);
219     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
220 
221     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
222     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
223     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
224     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
225 
226     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
227     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
228     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
229     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
230   }
231 
232   sad_array[0] = HADD_UH_U32(sad0);
233   sad_array[1] = HADD_UH_U32(sad1);
234   sad_array[2] = HADD_UH_U32(sad2);
235 }
236 
237 static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
238                                const uint8_t *ref_ptr, int32_t ref_stride,
239                                int32_t height, uint32_t *sad_array) {
240   int32_t ht_cnt;
241   v16u8 src, ref, ref0, ref1, diff;
242   v8u16 sad0 = { 0 };
243   v8u16 sad1 = { 0 };
244   v8u16 sad2 = { 0 };
245 
246   for (ht_cnt = (height >> 1); ht_cnt--;) {
247     src = LD_UB(src_ptr);
248     src_ptr += src_stride;
249     LD_UB2(ref_ptr, 16, ref0, ref1);
250     ref_ptr += ref_stride;
251 
252     diff = __msa_asub_u_b(src, ref0);
253     sad0 += __msa_hadd_u_h(diff, diff);
254 
255     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
256     diff = __msa_asub_u_b(src, ref);
257     sad1 += __msa_hadd_u_h(diff, diff);
258 
259     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
260     diff = __msa_asub_u_b(src, ref);
261     sad2 += __msa_hadd_u_h(diff, diff);
262 
263     src = LD_UB(src_ptr);
264     src_ptr += src_stride;
265     LD_UB2(ref_ptr, 16, ref0, ref1);
266     ref_ptr += ref_stride;
267 
268     diff = __msa_asub_u_b(src, ref0);
269     sad0 += __msa_hadd_u_h(diff, diff);
270 
271     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
272     diff = __msa_asub_u_b(src, ref);
273     sad1 += __msa_hadd_u_h(diff, diff);
274 
275     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
276     diff = __msa_asub_u_b(src, ref);
277     sad2 += __msa_hadd_u_h(diff, diff);
278   }
279 
280   sad_array[0] = HADD_UH_U32(sad0);
281   sad_array[1] = HADD_UH_U32(sad1);
282   sad_array[2] = HADD_UH_U32(sad2);
283 }
284 
285 static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
286                                const uint8_t *ref, int32_t ref_stride,
287                                int32_t height, uint32_t *sad_array) {
288   int32_t ht_cnt;
289   v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
290   v8u16 sad0 = { 0 };
291   v8u16 sad1 = { 0 };
292   v8u16 sad2 = { 0 };
293 
294   for (ht_cnt = height >> 1; ht_cnt--;) {
295     LD_UB2(src, 16, src0, src1);
296     src += src_stride;
297     LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
298     ref += ref_stride;
299 
300     sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
301 
302     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
303     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
304 
305     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
306     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
307 
308     LD_UB2(src, 16, src0, src1);
309     src += src_stride;
310     LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
311     ref += ref_stride;
312 
313     sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
314 
315     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
316     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
317 
318     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
319     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
320   }
321 
322   sad_array[0] = HADD_UH_U32(sad0);
323   sad_array[1] = HADD_UH_U32(sad1);
324   sad_array[2] = HADD_UH_U32(sad2);
325 }
326 
327 static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
328                                const uint8_t *ref, int32_t ref_stride,
329                                int32_t height, uint32_t *sad_array) {
330   int32_t ht_cnt;
331   v16u8 src0, src1, src2, src3;
332   v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
333   v8u16 sad0_0 = { 0 };
334   v8u16 sad0_1 = { 0 };
335   v8u16 sad1_0 = { 0 };
336   v8u16 sad1_1 = { 0 };
337   v8u16 sad2_0 = { 0 };
338   v8u16 sad2_1 = { 0 };
339   v4u32 sad;
340 
341   for (ht_cnt = height; ht_cnt--;) {
342     LD_UB4(src, 16, src0, src1, src2, src3);
343     src += src_stride;
344     LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
345     ref0_4 = LD_UB(ref + 64);
346     ref += ref_stride;
347 
348     sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
349     sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
350 
351     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
352     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
353     sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
354     sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
355 
356     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
357     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
358     sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
359     sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
360   }
361 
362   sad = __msa_hadd_u_w(sad0_0, sad0_0);
363   sad += __msa_hadd_u_w(sad0_1, sad0_1);
364   sad_array[0] = HADD_SW_S32((v4i32)sad);
365 
366   sad = __msa_hadd_u_w(sad1_0, sad1_0);
367   sad += __msa_hadd_u_w(sad1_1, sad1_1);
368   sad_array[1] = HADD_SW_S32((v4i32)sad);
369 
370   sad = __msa_hadd_u_w(sad2_0, sad2_0);
371   sad += __msa_hadd_u_w(sad2_1, sad2_1);
372   sad_array[2] = HADD_SW_S32((v4i32)sad);
373 }
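
/* For 64-wide blocks the per-lane 16-bit sums are widened to 32 bits with
 * __msa_hadd_u_w before sadN_0 and sadN_1 are combined: a full 64x64 SAD can
 * reach 64 * 64 * 255 = 1044480, far beyond what a 16-bit lane can hold, so
 * the final reduction has to happen in 32-bit lanes.
 */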
374 
375 static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
376                               const uint8_t *ref_ptr, int32_t ref_stride,
377                               int32_t height, uint32_t *sad_array) {
378   int32_t ht_cnt;
379   uint32_t src0, src1, src2, src3;
380   v16u8 ref0, ref1, ref2, ref3, diff;
381   v16u8 src = { 0 };
382   v16u8 ref = { 0 };
383   v8u16 sad0 = { 0 };
384   v8u16 sad1 = { 0 };
385   v8u16 sad2 = { 0 };
386   v8u16 sad3 = { 0 };
387   v8u16 sad4 = { 0 };
388   v8u16 sad5 = { 0 };
389   v8u16 sad6 = { 0 };
390   v8u16 sad7 = { 0 };
391 
392   for (ht_cnt = (height >> 2); ht_cnt--;) {
393     LW4(src_ptr, src_stride, src0, src1, src2, src3);
394     INSERT_W4_UB(src0, src1, src2, src3, src);
395     src_ptr += (4 * src_stride);
396     LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
397     ref_ptr += (4 * ref_stride);
398 
399     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
400     diff = __msa_asub_u_b(src, ref);
401     sad0 += __msa_hadd_u_h(diff, diff);
402 
403     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
404     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
405     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
406     diff = __msa_asub_u_b(src, ref);
407     sad1 += __msa_hadd_u_h(diff, diff);
408 
409     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
410     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
411     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
412     diff = __msa_asub_u_b(src, ref);
413     sad2 += __msa_hadd_u_h(diff, diff);
414 
415     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
416     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
417     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
418     diff = __msa_asub_u_b(src, ref);
419     sad3 += __msa_hadd_u_h(diff, diff);
420 
421     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
422     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
423     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
424     diff = __msa_asub_u_b(src, ref);
425     sad4 += __msa_hadd_u_h(diff, diff);
426 
427     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
428     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
429     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
430     diff = __msa_asub_u_b(src, ref);
431     sad5 += __msa_hadd_u_h(diff, diff);
432 
433     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
434     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
435     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
436     diff = __msa_asub_u_b(src, ref);
437     sad6 += __msa_hadd_u_h(diff, diff);
438 
439     SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
440     SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
441     SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
442     diff = __msa_asub_u_b(src, ref);
443     sad7 += __msa_hadd_u_h(diff, diff);
444   }
445 
446   sad_array[0] = HADD_UH_U32(sad0);
447   sad_array[1] = HADD_UH_U32(sad1);
448   sad_array[2] = HADD_UH_U32(sad2);
449   sad_array[3] = HADD_UH_U32(sad3);
450   sad_array[4] = HADD_UH_U32(sad4);
451   sad_array[5] = HADD_UH_U32(sad5);
452   sad_array[6] = HADD_UH_U32(sad6);
453   sad_array[7] = HADD_UH_U32(sad7);
454 }
455 
456 static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
457                               const uint8_t *ref, int32_t ref_stride,
458                               int32_t height, uint32_t *sad_array) {
459   int32_t ht_cnt;
460   v16u8 src0, src1, src2, src3;
461   v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
462   v8u16 sad0 = { 0 };
463   v8u16 sad1 = { 0 };
464   v8u16 sad2 = { 0 };
465   v8u16 sad3 = { 0 };
466   v8u16 sad4 = { 0 };
467   v8u16 sad5 = { 0 };
468   v8u16 sad6 = { 0 };
469   v8u16 sad7 = { 0 };
470 
471   for (ht_cnt = (height >> 2); ht_cnt--;) {
472     LD_UB4(src, src_stride, src0, src1, src2, src3);
473     src += (4 * src_stride);
474     LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
475     ref += (4 * ref_stride);
476     PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
477                 src0, src1, ref0, ref1);
478     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
479 
480     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
481     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
482     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
483     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
484 
485     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
486     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
487     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
488     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
489 
490     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
491     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
492     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
493     sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
494 
495     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
496     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
497     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
498     sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
499 
500     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
501     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
502     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
503     sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
504 
505     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
506     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
507     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
508     sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
509 
510     SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
511     SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
512     PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
513     sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
514   }
515 
516   sad_array[0] = HADD_UH_U32(sad0);
517   sad_array[1] = HADD_UH_U32(sad1);
518   sad_array[2] = HADD_UH_U32(sad2);
519   sad_array[3] = HADD_UH_U32(sad3);
520   sad_array[4] = HADD_UH_U32(sad4);
521   sad_array[5] = HADD_UH_U32(sad5);
522   sad_array[6] = HADD_UH_U32(sad6);
523   sad_array[7] = HADD_UH_U32(sad7);
524 }
525 
526 static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
527                                const uint8_t *ref_ptr, int32_t ref_stride,
528                                int32_t height, uint32_t *sad_array) {
529   int32_t ht_cnt;
530   v16u8 src, ref0, ref1, ref;
531   v16u8 diff;
532   v8u16 sad0 = { 0 };
533   v8u16 sad1 = { 0 };
534   v8u16 sad2 = { 0 };
535   v8u16 sad3 = { 0 };
536   v8u16 sad4 = { 0 };
537   v8u16 sad5 = { 0 };
538   v8u16 sad6 = { 0 };
539   v8u16 sad7 = { 0 };
540 
541   for (ht_cnt = (height >> 1); ht_cnt--;) {
542     src = LD_UB(src_ptr);
543     src_ptr += src_stride;
544     LD_UB2(ref_ptr, 16, ref0, ref1);
545     ref_ptr += ref_stride;
546 
547     diff = __msa_asub_u_b(src, ref0);
548     sad0 += __msa_hadd_u_h(diff, diff);
549 
550     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
551     diff = __msa_asub_u_b(src, ref);
552     sad1 += __msa_hadd_u_h(diff, diff);
553 
554     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
555     diff = __msa_asub_u_b(src, ref);
556     sad2 += __msa_hadd_u_h(diff, diff);
557 
558     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
559     diff = __msa_asub_u_b(src, ref);
560     sad3 += __msa_hadd_u_h(diff, diff);
561 
562     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
563     diff = __msa_asub_u_b(src, ref);
564     sad4 += __msa_hadd_u_h(diff, diff);
565 
566     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
567     diff = __msa_asub_u_b(src, ref);
568     sad5 += __msa_hadd_u_h(diff, diff);
569 
570     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
571     diff = __msa_asub_u_b(src, ref);
572     sad6 += __msa_hadd_u_h(diff, diff);
573 
574     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
575     diff = __msa_asub_u_b(src, ref);
576     sad7 += __msa_hadd_u_h(diff, diff);
577 
578     src = LD_UB(src_ptr);
579     src_ptr += src_stride;
580     LD_UB2(ref_ptr, 16, ref0, ref1);
581     ref_ptr += ref_stride;
582 
583     diff = __msa_asub_u_b(src, ref0);
584     sad0 += __msa_hadd_u_h(diff, diff);
585 
586     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
587     diff = __msa_asub_u_b(src, ref);
588     sad1 += __msa_hadd_u_h(diff, diff);
589 
590     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
591     diff = __msa_asub_u_b(src, ref);
592     sad2 += __msa_hadd_u_h(diff, diff);
593 
594     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
595     diff = __msa_asub_u_b(src, ref);
596     sad3 += __msa_hadd_u_h(diff, diff);
597 
598     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
599     diff = __msa_asub_u_b(src, ref);
600     sad4 += __msa_hadd_u_h(diff, diff);
601 
602     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
603     diff = __msa_asub_u_b(src, ref);
604     sad5 += __msa_hadd_u_h(diff, diff);
605 
606     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
607     diff = __msa_asub_u_b(src, ref);
608     sad6 += __msa_hadd_u_h(diff, diff);
609 
610     ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
611     diff = __msa_asub_u_b(src, ref);
612     sad7 += __msa_hadd_u_h(diff, diff);
613   }
614 
615   sad_array[0] = HADD_UH_U32(sad0);
616   sad_array[1] = HADD_UH_U32(sad1);
617   sad_array[2] = HADD_UH_U32(sad2);
618   sad_array[3] = HADD_UH_U32(sad3);
619   sad_array[4] = HADD_UH_U32(sad4);
620   sad_array[5] = HADD_UH_U32(sad5);
621   sad_array[6] = HADD_UH_U32(sad6);
622   sad_array[7] = HADD_UH_U32(sad7);
623 }
624 
625 static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
626                                const uint8_t *ref, int32_t ref_stride,
627                                int32_t height, uint32_t *sad_array) {
628   int32_t ht_cnt;
629   v16u8 src0, src1;
630   v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
631   v8u16 sad0 = { 0 };
632   v8u16 sad1 = { 0 };
633   v8u16 sad2 = { 0 };
634   v8u16 sad3 = { 0 };
635   v8u16 sad4 = { 0 };
636   v8u16 sad5 = { 0 };
637   v8u16 sad6 = { 0 };
638   v8u16 sad7 = { 0 };
639 
640   for (ht_cnt = height; ht_cnt--;) {
641     LD_UB2(src, 16, src0, src1);
642     src += src_stride;
643     LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
644     ref += ref_stride;
645 
646     sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
647 
648     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
649     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
650 
651     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
652     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
653 
654     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
655     sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
656 
657     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
658     sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
659 
660     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
661     sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
662 
663     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
664     sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
665 
666     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
667     sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
668   }
669 
670   sad_array[0] = HADD_UH_U32(sad0);
671   sad_array[1] = HADD_UH_U32(sad1);
672   sad_array[2] = HADD_UH_U32(sad2);
673   sad_array[3] = HADD_UH_U32(sad3);
674   sad_array[4] = HADD_UH_U32(sad4);
675   sad_array[5] = HADD_UH_U32(sad5);
676   sad_array[6] = HADD_UH_U32(sad6);
677   sad_array[7] = HADD_UH_U32(sad7);
678 }
679 
680 static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
681                                const uint8_t *ref, int32_t ref_stride,
682                                int32_t height, uint32_t *sad_array) {
683   const uint8_t *src_dup, *ref_dup;
684   int32_t ht_cnt;
685   v16u8 src0, src1, src2, src3;
686   v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
687   v16u8 ref0, ref1, ref2, ref3;
688   v8u16 sad0_0 = { 0 };
689   v8u16 sad0_1 = { 0 };
690   v8u16 sad1_0 = { 0 };
691   v8u16 sad1_1 = { 0 };
692   v8u16 sad2_0 = { 0 };
693   v8u16 sad2_1 = { 0 };
694   v8u16 sad3_0 = { 0 };
695   v8u16 sad3_1 = { 0 };
696   v4u32 sad;
697 
698   src_dup = src;
699   ref_dup = ref;
700 
701   for (ht_cnt = height; ht_cnt--;) {
702     LD_UB4(src, 16, src0, src1, src2, src3);
703     src += src_stride;
704     LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
705     ref += ref_stride;
706 
707     sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
708     sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
709 
710     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
711     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
712     sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
713     sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
714 
715     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
716     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
717     sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
718     sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
719 
720     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
721     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
722     sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
723     sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
724   }
725 
726   sad = __msa_hadd_u_w(sad0_0, sad0_0);
727   sad += __msa_hadd_u_w(sad0_1, sad0_1);
728   sad_array[0] = HADD_SW_S32(sad);
729 
730   sad = __msa_hadd_u_w(sad1_0, sad1_0);
731   sad += __msa_hadd_u_w(sad1_1, sad1_1);
732   sad_array[1] = HADD_SW_S32(sad);
733 
734   sad = __msa_hadd_u_w(sad2_0, sad2_0);
735   sad += __msa_hadd_u_w(sad2_1, sad2_1);
736   sad_array[2] = HADD_SW_S32(sad);
737 
738   sad = __msa_hadd_u_w(sad3_0, sad3_0);
739   sad += __msa_hadd_u_w(sad3_1, sad3_1);
740   sad_array[3] = HADD_SW_S32(sad);
741 
742   sad0_0 = (v8u16)__msa_ldi_h(0);
743   sad0_1 = (v8u16)__msa_ldi_h(0);
744   sad1_0 = (v8u16)__msa_ldi_h(0);
745   sad1_1 = (v8u16)__msa_ldi_h(0);
746   sad2_0 = (v8u16)__msa_ldi_h(0);
747   sad2_1 = (v8u16)__msa_ldi_h(0);
748   sad3_0 = (v8u16)__msa_ldi_h(0);
749   sad3_1 = (v8u16)__msa_ldi_h(0);
750 
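  /* Second pass: walk the same 'height' rows again from the saved
   * src_dup/ref_dup pointers, this time for horizontal offsets 4..7. */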
751   for (ht_cnt = height; ht_cnt--;) {
752     LD_UB4(src_dup, 16, src0, src1, src2, src3);
753     src_dup += src_stride;
754     LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
755     ref_dup += ref_stride;
756 
757     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
758     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
759     sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
760     sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
761 
762     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
763     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
764     sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
765     sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
766 
767     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
768     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
769     sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
770     sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
771 
772     SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
773     SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
774     sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
775     sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
776   }
777 
778   sad = __msa_hadd_u_w(sad0_0, sad0_0);
779   sad += __msa_hadd_u_w(sad0_1, sad0_1);
780   sad_array[4] = HADD_SW_S32(sad);
781 
782   sad = __msa_hadd_u_w(sad1_0, sad1_0);
783   sad += __msa_hadd_u_w(sad1_1, sad1_1);
784   sad_array[5] = HADD_SW_S32(sad);
785 
786   sad = __msa_hadd_u_w(sad2_0, sad2_0);
787   sad += __msa_hadd_u_w(sad2_1, sad2_1);
788   sad_array[6] = HADD_SW_S32(sad);
789 
790   sad = __msa_hadd_u_w(sad3_0, sad3_0);
791   sad += __msa_hadd_u_w(sad3_1, sad3_1);
792   sad_array[7] = HADD_SW_S32(sad);
793 }
794 
795 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
796                                const uint8_t * const aref_ptr[],
797                                int32_t ref_stride,
798                                int32_t height, uint32_t *sad_array) {
799   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
800   int32_t ht_cnt;
801   uint32_t src0, src1, src2, src3;
802   uint32_t ref0, ref1, ref2, ref3;
803   v16u8 src = { 0 };
804   v16u8 ref = { 0 };
805   v16u8 diff;
806   v8u16 sad0 = { 0 };
807   v8u16 sad1 = { 0 };
808   v8u16 sad2 = { 0 };
809   v8u16 sad3 = { 0 };
810 
811   ref0_ptr = aref_ptr[0];
812   ref1_ptr = aref_ptr[1];
813   ref2_ptr = aref_ptr[2];
814   ref3_ptr = aref_ptr[3];
815 
816   for (ht_cnt = (height >> 2); ht_cnt--;) {
817     LW4(src_ptr, src_stride, src0, src1, src2, src3);
818     INSERT_W4_UB(src0, src1, src2, src3, src);
819     src_ptr += (4 * src_stride);
820 
821     LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
822     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
823     ref0_ptr += (4 * ref_stride);
824 
825     diff = __msa_asub_u_b(src, ref);
826     sad0 += __msa_hadd_u_h(diff, diff);
827 
828     LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
829     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
830     ref1_ptr += (4 * ref_stride);
831 
832     diff = __msa_asub_u_b(src, ref);
833     sad1 += __msa_hadd_u_h(diff, diff);
834 
835     LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
836     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
837     ref2_ptr += (4 * ref_stride);
838 
839     diff = __msa_asub_u_b(src, ref);
840     sad2 += __msa_hadd_u_h(diff, diff);
841 
842     LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
843     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
844     ref3_ptr += (4 * ref_stride);
845 
846     diff = __msa_asub_u_b(src, ref);
847     sad3 += __msa_hadd_u_h(diff, diff);
848   }
849 
850   sad_array[0] = HADD_UH_U32(sad0);
851   sad_array[1] = HADD_UH_U32(sad1);
852   sad_array[2] = HADD_UH_U32(sad2);
853   sad_array[3] = HADD_UH_U32(sad3);
854 }
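
/* The x4d variants take an array of four independent reference pointers and
 * return one SAD per reference; unlike the x3/x8 variants, the candidates
 * are separate blocks (typically four motion-vector candidates), not
 * horizontal shifts of a single reference row.
 */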
855 
856 static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
857                                const uint8_t * const aref_ptr[],
858                                int32_t ref_stride,
859                                int32_t height, uint32_t *sad_array) {
860   int32_t ht_cnt;
861   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
862   v16u8 src0, src1, src2, src3;
863   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
864   v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
865   v8u16 sad0 = { 0 };
866   v8u16 sad1 = { 0 };
867   v8u16 sad2 = { 0 };
868   v8u16 sad3 = { 0 };
869 
870   ref0_ptr = aref_ptr[0];
871   ref1_ptr = aref_ptr[1];
872   ref2_ptr = aref_ptr[2];
873   ref3_ptr = aref_ptr[3];
874 
875   for (ht_cnt = (height >> 2); ht_cnt--;) {
876     LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
877     src_ptr += (4 * src_stride);
878     LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
879     ref0_ptr += (4 * ref_stride);
880     LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
881     ref1_ptr += (4 * ref_stride);
882     LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
883     ref2_ptr += (4 * ref_stride);
884     LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
885     ref3_ptr += (4 * ref_stride);
886 
887     PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
888     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
889     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
890 
891     PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
892     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
893 
894     PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
895     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
896 
897     PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
898     sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
899   }
900 
901   sad_array[0] = HADD_UH_U32(sad0);
902   sad_array[1] = HADD_UH_U32(sad1);
903   sad_array[2] = HADD_UH_U32(sad2);
904   sad_array[3] = HADD_UH_U32(sad3);
905 }
906 
907 static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
908                                 const uint8_t * const aref_ptr[],
909                                 int32_t ref_stride,
910                                 int32_t height, uint32_t *sad_array) {
911   int32_t ht_cnt;
912   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
913   v16u8 src, ref0, ref1, ref2, ref3, diff;
914   v8u16 sad0 = { 0 };
915   v8u16 sad1 = { 0 };
916   v8u16 sad2 = { 0 };
917   v8u16 sad3 = { 0 };
918 
919   ref0_ptr = aref_ptr[0];
920   ref1_ptr = aref_ptr[1];
921   ref2_ptr = aref_ptr[2];
922   ref3_ptr = aref_ptr[3];
923 
924   for (ht_cnt = (height >> 1); ht_cnt--;) {
925     src = LD_UB(src_ptr);
926     src_ptr += src_stride;
927     ref0 = LD_UB(ref0_ptr);
928     ref0_ptr += ref_stride;
929     ref1 = LD_UB(ref1_ptr);
930     ref1_ptr += ref_stride;
931     ref2 = LD_UB(ref2_ptr);
932     ref2_ptr += ref_stride;
933     ref3 = LD_UB(ref3_ptr);
934     ref3_ptr += ref_stride;
935 
936     diff = __msa_asub_u_b(src, ref0);
937     sad0 += __msa_hadd_u_h(diff, diff);
938     diff = __msa_asub_u_b(src, ref1);
939     sad1 += __msa_hadd_u_h(diff, diff);
940     diff = __msa_asub_u_b(src, ref2);
941     sad2 += __msa_hadd_u_h(diff, diff);
942     diff = __msa_asub_u_b(src, ref3);
943     sad3 += __msa_hadd_u_h(diff, diff);
944 
945     src = LD_UB(src_ptr);
946     src_ptr += src_stride;
947     ref0 = LD_UB(ref0_ptr);
948     ref0_ptr += ref_stride;
949     ref1 = LD_UB(ref1_ptr);
950     ref1_ptr += ref_stride;
951     ref2 = LD_UB(ref2_ptr);
952     ref2_ptr += ref_stride;
953     ref3 = LD_UB(ref3_ptr);
954     ref3_ptr += ref_stride;
955 
956     diff = __msa_asub_u_b(src, ref0);
957     sad0 += __msa_hadd_u_h(diff, diff);
958     diff = __msa_asub_u_b(src, ref1);
959     sad1 += __msa_hadd_u_h(diff, diff);
960     diff = __msa_asub_u_b(src, ref2);
961     sad2 += __msa_hadd_u_h(diff, diff);
962     diff = __msa_asub_u_b(src, ref3);
963     sad3 += __msa_hadd_u_h(diff, diff);
964   }
965 
966   sad_array[0] = HADD_UH_U32(sad0);
967   sad_array[1] = HADD_UH_U32(sad1);
968   sad_array[2] = HADD_UH_U32(sad2);
969   sad_array[3] = HADD_UH_U32(sad3);
970 }
971 
972 static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
973                                 const uint8_t * const aref_ptr[],
974                                 int32_t ref_stride,
975                                 int32_t height, uint32_t *sad_array) {
976   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
977   int32_t ht_cnt;
978   v16u8 src0, src1, ref0, ref1;
979   v8u16 sad0 = { 0 };
980   v8u16 sad1 = { 0 };
981   v8u16 sad2 = { 0 };
982   v8u16 sad3 = { 0 };
983 
984   ref0_ptr = aref_ptr[0];
985   ref1_ptr = aref_ptr[1];
986   ref2_ptr = aref_ptr[2];
987   ref3_ptr = aref_ptr[3];
988 
989   for (ht_cnt = height; ht_cnt--;) {
990     LD_UB2(src, 16, src0, src1);
991     src += src_stride;
992 
993     LD_UB2(ref0_ptr, 16, ref0, ref1);
994     ref0_ptr += ref_stride;
995     sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
996 
997     LD_UB2(ref1_ptr, 16, ref0, ref1);
998     ref1_ptr += ref_stride;
999     sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
1000 
1001     LD_UB2(ref2_ptr, 16, ref0, ref1);
1002     ref2_ptr += ref_stride;
1003     sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
1004 
1005     LD_UB2(ref3_ptr, 16, ref0, ref1);
1006     ref3_ptr += ref_stride;
1007     sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
1008   }
1009 
1010   sad_array[0] = HADD_UH_U32(sad0);
1011   sad_array[1] = HADD_UH_U32(sad1);
1012   sad_array[2] = HADD_UH_U32(sad2);
1013   sad_array[3] = HADD_UH_U32(sad3);
1014 }
1015 
1016 static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
1017                                 const uint8_t * const aref_ptr[],
1018                                 int32_t ref_stride,
1019                                 int32_t height, uint32_t *sad_array) {
1020   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
1021   int32_t ht_cnt;
1022   v16u8 src0, src1, src2, src3;
1023   v16u8 ref0, ref1, ref2, ref3;
1024   v8u16 sad0_0 = { 0 };
1025   v8u16 sad0_1 = { 0 };
1026   v8u16 sad1_0 = { 0 };
1027   v8u16 sad1_1 = { 0 };
1028   v8u16 sad2_0 = { 0 };
1029   v8u16 sad2_1 = { 0 };
1030   v8u16 sad3_0 = { 0 };
1031   v8u16 sad3_1 = { 0 };
1032 
1033   ref0_ptr = aref_ptr[0];
1034   ref1_ptr = aref_ptr[1];
1035   ref2_ptr = aref_ptr[2];
1036   ref3_ptr = aref_ptr[3];
1037 
1038   for (ht_cnt = height; ht_cnt--;) {
1039     LD_UB4(src, 16, src0, src1, src2, src3);
1040     src += src_stride;
1041 
1042     LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
1043     ref0_ptr += ref_stride;
1044     sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
1045     sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
1046 
1047     LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
1048     ref1_ptr += ref_stride;
1049     sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
1050     sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
1051 
1052     LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
1053     ref2_ptr += ref_stride;
1054     sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
1055     sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
1056 
1057     LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
1058     ref3_ptr += ref_stride;
1059     sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
1060     sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
1061   }
1062 
1063   sad_array[0] = HADD_UH_U32(sad0_0);
1064   sad_array[0] += HADD_UH_U32(sad0_1);
1065   sad_array[1] = HADD_UH_U32(sad1_0);
1066   sad_array[1] += HADD_UH_U32(sad1_1);
1067   sad_array[2] = HADD_UH_U32(sad2_0);
1068   sad_array[2] += HADD_UH_U32(sad2_1);
1069   sad_array[3] = HADD_UH_U32(sad3_0);
1070   sad_array[3] += HADD_UH_U32(sad3_1);
1071 }
1072 
1073 static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
1074                                   const uint8_t *ref_ptr, int32_t ref_stride,
1075                                   int32_t height, const uint8_t *sec_pred) {
1076   int32_t ht_cnt;
1077   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
1078   v16u8 src = { 0 };
1079   v16u8 ref = { 0 };
1080   v16u8 diff, pred, comp;
1081   v8u16 sad = { 0 };
1082 
1083   for (ht_cnt = (height >> 2); ht_cnt--;) {
1084     LW4(src_ptr, src_stride, src0, src1, src2, src3);
1085     src_ptr += (4 * src_stride);
1086     LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
1087     ref_ptr += (4 * ref_stride);
1088     pred = LD_UB(sec_pred);
1089     sec_pred += 16;
1090 
1091     INSERT_W4_UB(src0, src1, src2, src3, src);
1092     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1093 
1094     comp = __msa_aver_u_b(pred, ref);
1095     diff = __msa_asub_u_b(src, comp);
1096     sad += __msa_hadd_u_h(diff, diff);
1097   }
1098 
1099   return HADD_UH_U32(sad);
1100 }
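
/* The avgsad_* helpers compare the source against the rounding average of
 * the reference and a second predictor (compound prediction).  Per-pixel
 * sketch of what the vector code computes (illustrative only):
 *
 *   comp = (ref[i] + sec_pred[i] + 1) >> 1;    aver_u_b
 *   sad += abs(src[i] - comp);                 asub_u_b + hadd_u_h
 */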
1101 
1102 static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
1103                                   const uint8_t *ref, int32_t ref_stride,
1104                                   int32_t height, const uint8_t *sec_pred) {
1105   int32_t ht_cnt;
1106   v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
1107   v16u8 diff0, diff1, pred0, pred1;
1108   v8u16 sad = { 0 };
1109 
1110   for (ht_cnt = (height >> 2); ht_cnt--;) {
1111     LD_UB4(src, src_stride, src0, src1, src2, src3);
1112     src += (4 * src_stride);
1113     LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
1114     ref += (4 * ref_stride);
1115     LD_UB2(sec_pred, 16, pred0, pred1);
1116     sec_pred += 32;
1117     PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
1118                 src0, src1, ref0, ref1);
1119     AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
1120     sad += SAD_UB2_UH(src0, src1, diff0, diff1);
1121   }
1122 
1123   return HADD_UH_U32(sad);
1124 }
1125 
1126 static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
1127                                    const uint8_t *ref, int32_t ref_stride,
1128                                    int32_t height, const uint8_t *sec_pred) {
1129   int32_t ht_cnt;
1130   v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
1131   v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
1132   v8u16 sad = { 0 };
1133 
1134   for (ht_cnt = (height >> 3); ht_cnt--;) {
1135     LD_UB4(src, src_stride, src0, src1, src2, src3);
1136     src += (4 * src_stride);
1137     LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
1138     ref += (4 * ref_stride);
1139     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1140     sec_pred += (4 * 16);
1141     AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
1142     sad += SAD_UB2_UH(src0, src1, comp0, comp1);
1143     AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
1144     sad += SAD_UB2_UH(src2, src3, comp0, comp1);
1145 
1146     LD_UB4(src, src_stride, src0, src1, src2, src3);
1147     src += (4 * src_stride);
1148     LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
1149     ref += (4 * ref_stride);
1150     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1151     sec_pred += (4 * 16);
1152     AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
1153     sad += SAD_UB2_UH(src0, src1, comp0, comp1);
1154     AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
1155     sad += SAD_UB2_UH(src2, src3, comp0, comp1);
1156   }
1157 
1158   return HADD_UH_U32(sad);
1159 }
1160 
1161 static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
1162                                    const uint8_t *ref, int32_t ref_stride,
1163                                    int32_t height, const uint8_t *sec_pred) {
1164   int32_t ht_cnt;
1165   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1166   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
1167   v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
1168   v16u8 comp0, comp1;
1169   v8u16 sad = { 0 };
1170 
1171   for (ht_cnt = (height >> 2); ht_cnt--;) {
1172     LD_UB4(src, src_stride, src0, src2, src4, src6);
1173     LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
1174     src += (4 * src_stride);
1175 
1176     LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
1177     LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
1178     ref += (4 * ref_stride);
1179 
1180     LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
1181     LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
1182     sec_pred += (4 * 32);
1183 
1184     AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
1185     sad += SAD_UB2_UH(src0, src1, comp0, comp1);
1186     AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
1187     sad += SAD_UB2_UH(src2, src3, comp0, comp1);
1188     AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
1189     sad += SAD_UB2_UH(src4, src5, comp0, comp1);
1190     AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
1191     sad += SAD_UB2_UH(src6, src7, comp0, comp1);
1192   }
1193 
1194   return HADD_UH_U32(sad);
1195 }
1196 
1197 static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
1198                                    const uint8_t *ref, int32_t ref_stride,
1199                                    int32_t height, const uint8_t *sec_pred) {
1200   int32_t ht_cnt;
1201   v16u8 src0, src1, src2, src3;
1202   v16u8 ref0, ref1, ref2, ref3;
1203   v16u8 comp0, comp1, comp2, comp3;
1204   v16u8 pred0, pred1, pred2, pred3;
1205   v8u16 sad0 = { 0 };
1206   v8u16 sad1 = { 0 };
1207   v4u32 sad;
1208 
1209   for (ht_cnt = (height >> 2); ht_cnt--;) {
1210     LD_UB4(src, 16, src0, src1, src2, src3);
1211     src += src_stride;
1212     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1213     ref += ref_stride;
1214     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1215     sec_pred += 64;
1216     AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
1217                 comp0, comp1, comp2, comp3);
1218     sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1219     sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1220 
1221     LD_UB4(src, 16, src0, src1, src2, src3);
1222     src += src_stride;
1223     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1224     ref += ref_stride;
1225     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1226     sec_pred += 64;
1227     AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
1228                 comp0, comp1, comp2, comp3);
1229     sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1230     sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1231 
1232     LD_UB4(src, 16, src0, src1, src2, src3);
1233     src += src_stride;
1234     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1235     ref += ref_stride;
1236     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1237     sec_pred += 64;
1238     AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
1239                 comp0, comp1, comp2, comp3);
1240     sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1241     sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1242 
1243     LD_UB4(src, 16, src0, src1, src2, src3);
1244     src += src_stride;
1245     LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1246     ref += ref_stride;
1247     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1248     sec_pred += 64;
1249     AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
1250                 comp0, comp1, comp2, comp3);
1251     sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1252     sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1253   }
1254 
1255   sad = __msa_hadd_u_w(sad0, sad0);
1256   sad += __msa_hadd_u_w(sad1, sad1);
1257 
1258   return HADD_SW_S32(sad);
1259 }
1260 
1261 #define VPX_SAD_4xHEIGHT_MSA(height)                                        \
1262 uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,    \
1263                                  const uint8_t *ref, int32_t ref_stride) {  \
1264   return sad_4width_msa(src, src_stride,  ref, ref_stride, height);         \
1265 }
1266 
1267 #define VPX_SAD_8xHEIGHT_MSA(height)                                        \
1268 uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,    \
1269                                  const uint8_t *ref, int32_t ref_stride) {  \
1270   return sad_8width_msa(src, src_stride, ref, ref_stride, height);          \
1271 }
1272 
1273 #define VPX_SAD_16xHEIGHT_MSA(height)                                        \
1274 uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,    \
1275                                   const uint8_t *ref, int32_t ref_stride) {  \
1276   return sad_16width_msa(src, src_stride, ref, ref_stride, height);          \
1277 }
1278 
1279 #define VPX_SAD_32xHEIGHT_MSA(height)                                        \
1280 uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,    \
1281                                   const uint8_t *ref, int32_t ref_stride) {  \
1282   return sad_32width_msa(src, src_stride, ref, ref_stride, height);          \
1283 }
1284 
1285 #define VPX_SAD_64xHEIGHT_MSA(height)                                        \
1286 uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,    \
1287                                   const uint8_t *ref, int32_t ref_stride) {  \
1288   return sad_64width_msa(src, src_stride, ref, ref_stride, height);          \
1289 }
1290 
1291 #define VPX_SAD_4xHEIGHTx3_MSA(height)                                  \
1292 void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
1293                                const uint8_t *ref, int32_t ref_stride,  \
1294                                uint32_t *sads) {                        \
1295   sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
1296 }
1297 
1298 #define VPX_SAD_8xHEIGHTx3_MSA(height)                                  \
1299 void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
1300                                const uint8_t *ref, int32_t ref_stride,  \
1301                                uint32_t *sads) {                        \
1302   sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
1303 }
1304 
1305 #define VPX_SAD_16xHEIGHTx3_MSA(height)                                  \
1306 void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
1307                                 const uint8_t *ref, int32_t ref_stride,  \
1308                                 uint32_t *sads) {                        \
1309   sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
1310 }
1311 
1312 #define VPX_SAD_32xHEIGHTx3_MSA(height)                                  \
1313 void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
1314                                 const uint8_t *ref, int32_t ref_stride,  \
1315                                 uint32_t *sads) {                        \
1316   sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
1317 }
1318 
1319 #define VPX_SAD_64xHEIGHTx3_MSA(height)                                  \
1320 void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
1321                                 const uint8_t *ref, int32_t ref_stride,  \
1322                                 uint32_t *sads) {                        \
1323   sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
1324 }
1325 
1326 #define VPX_SAD_4xHEIGHTx8_MSA(height)                                  \
1327 void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
1328                                const uint8_t *ref, int32_t ref_stride,  \
1329                                uint32_t *sads) {                        \
1330   sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
1331 }
1332 
1333 #define VPX_SAD_8xHEIGHTx8_MSA(height)                                  \
1334 void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
1335                                const uint8_t *ref, int32_t ref_stride,  \
1336                                uint32_t *sads) {                        \
1337   sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
1338 }
1339 
1340 #define VPX_SAD_16xHEIGHTx8_MSA(height)                                  \
1341 void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
1342                                 const uint8_t *ref, int32_t ref_stride,  \
1343                                 uint32_t *sads) {                        \
1344   sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
1345 }
1346 
1347 #define VPX_SAD_32xHEIGHTx8_MSA(height)                                  \
1348 void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
1349                                 const uint8_t *ref, int32_t ref_stride,  \
1350                                 uint32_t *sads) {                        \
1351   sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
1352 }
1353 
1354 #define VPX_SAD_64xHEIGHTx8_MSA(height)                                  \
1355 void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
1356                                 const uint8_t *ref, int32_t ref_stride,  \
1357                                 uint32_t *sads) {                        \
1358   sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
1359 }
1360 
1361 #define VPX_SAD_4xHEIGHTx4D_MSA(height)                                  \
1362 void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
1363                                 const uint8_t *const refs[],             \
1364                                 int32_t ref_stride, uint32_t *sads) {    \
1365   sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
1366 }
1367 
1368 #define VPX_SAD_8xHEIGHTx4D_MSA(height)                                  \
1369 void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
1370                                 const uint8_t *const refs[],             \
1371                                 int32_t ref_stride, uint32_t *sads) {    \
1372   sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
1373 }
1374 
1375 #define VPX_SAD_16xHEIGHTx4D_MSA(height)                                  \
1376 void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
1377                                  const uint8_t *const refs[],             \
1378                                  int32_t ref_stride, uint32_t *sads) {    \
1379   sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
1380 }
1381 
1382 #define VPX_SAD_32xHEIGHTx4D_MSA(height)                                  \
1383 void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
1384                                  const uint8_t *const refs[],             \
1385                                  int32_t ref_stride, uint32_t *sads) {    \
1386   sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
1387 }
1388 
1389 #define VPX_SAD_64xHEIGHTx4D_MSA(height)                                  \
1390 void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
1391                                  const uint8_t *const refs[],             \
1392                                  int32_t ref_stride, uint32_t *sads) {    \
1393   sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
1394 }
1395 
1396 #define VPX_AVGSAD_4xHEIGHT_MSA(height)                                       \
1397 uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
1398                                      const uint8_t *ref, int32_t ref_stride,  \
1399                                      const uint8_t *second_pred) {            \
1400   return avgsad_4width_msa(src, src_stride, ref, ref_stride,                  \
1401                            height, second_pred);                              \
1402 }
1403 
1404 #define VPX_AVGSAD_8xHEIGHT_MSA(height)                                       \
1405 uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
1406                                      const uint8_t *ref, int32_t ref_stride,  \
1407                                      const uint8_t *second_pred) {            \
1408   return avgsad_8width_msa(src, src_stride, ref, ref_stride,                  \
1409                            height, second_pred);                              \
1410 }
1411 
1412 #define VPX_AVGSAD_16xHEIGHT_MSA(height)                                       \
1413 uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
1414                                       const uint8_t *ref, int32_t ref_stride,  \
1415                                       const uint8_t *second_pred) {            \
1416   return avgsad_16width_msa(src, src_stride, ref, ref_stride,                  \
1417                             height, second_pred);                              \
1418 }
1419 
1420 #define VPX_AVGSAD_32xHEIGHT_MSA(height)                                       \
1421 uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
1422                                       const uint8_t *ref, int32_t ref_stride,  \
1423                                       const uint8_t *second_pred) {            \
1424   return avgsad_32width_msa(src, src_stride, ref, ref_stride,                  \
1425                             height, second_pred);                              \
1426 }
1427 
1428 #define VPX_AVGSAD_64xHEIGHT_MSA(height)                                       \
1429 uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
1430                                       const uint8_t *ref, int32_t ref_stride,  \
1431                                       const uint8_t *second_pred) {            \
1432   return avgsad_64width_msa(src, src_stride, ref, ref_stride,                  \
1433                             height, second_pred);                              \
1434 }
1435 
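/* Each VPX_*_MSA(height) macro above stamps out one public entry point that
 * forwards to the matching width-specific helper.  For example,
 * VPX_SAD_16xHEIGHT_MSA(16) expands (whitespace aside) to:
 *
 *   uint32_t vpx_sad16x16_msa(const uint8_t *src, int32_t src_stride,
 *                             const uint8_t *ref, int32_t ref_stride) {
 *     return sad_16width_msa(src, src_stride, ref, ref_stride, 16);
 *   }
 */
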
1436 // 64x64
1437 VPX_SAD_64xHEIGHT_MSA(64);
1438 VPX_SAD_64xHEIGHTx3_MSA(64);
1439 VPX_SAD_64xHEIGHTx8_MSA(64);
1440 VPX_SAD_64xHEIGHTx4D_MSA(64);
1441 VPX_AVGSAD_64xHEIGHT_MSA(64);
1442 
1443 // 64x32
1444 VPX_SAD_64xHEIGHT_MSA(32);
1445 VPX_SAD_64xHEIGHTx3_MSA(32);
1446 VPX_SAD_64xHEIGHTx8_MSA(32);
1447 VPX_SAD_64xHEIGHTx4D_MSA(32);
1448 VPX_AVGSAD_64xHEIGHT_MSA(32);
1449 
1450 // 32x64
1451 VPX_SAD_32xHEIGHT_MSA(64);
1452 VPX_SAD_32xHEIGHTx3_MSA(64);
1453 VPX_SAD_32xHEIGHTx8_MSA(64);
1454 VPX_SAD_32xHEIGHTx4D_MSA(64);
1455 VPX_AVGSAD_32xHEIGHT_MSA(64);
1456 
1457 // 32x32
1458 VPX_SAD_32xHEIGHT_MSA(32);
1459 VPX_SAD_32xHEIGHTx3_MSA(32);
1460 VPX_SAD_32xHEIGHTx8_MSA(32);
1461 VPX_SAD_32xHEIGHTx4D_MSA(32);
1462 VPX_AVGSAD_32xHEIGHT_MSA(32);
1463 
1464 // 32x16
1465 VPX_SAD_32xHEIGHT_MSA(16);
1466 VPX_SAD_32xHEIGHTx3_MSA(16);
1467 VPX_SAD_32xHEIGHTx8_MSA(16);
1468 VPX_SAD_32xHEIGHTx4D_MSA(16);
1469 VPX_AVGSAD_32xHEIGHT_MSA(16);
1470 
1471 // 16x32
1472 VPX_SAD_16xHEIGHT_MSA(32);
1473 VPX_SAD_16xHEIGHTx3_MSA(32);
1474 VPX_SAD_16xHEIGHTx8_MSA(32);
1475 VPX_SAD_16xHEIGHTx4D_MSA(32);
1476 VPX_AVGSAD_16xHEIGHT_MSA(32);
1477 
1478 // 16x16
1479 VPX_SAD_16xHEIGHT_MSA(16);
1480 VPX_SAD_16xHEIGHTx3_MSA(16);
1481 VPX_SAD_16xHEIGHTx8_MSA(16);
1482 VPX_SAD_16xHEIGHTx4D_MSA(16);
1483 VPX_AVGSAD_16xHEIGHT_MSA(16);
1484 
1485 // 16x8
1486 VPX_SAD_16xHEIGHT_MSA(8);
1487 VPX_SAD_16xHEIGHTx3_MSA(8);
1488 VPX_SAD_16xHEIGHTx8_MSA(8);
1489 VPX_SAD_16xHEIGHTx4D_MSA(8);
1490 VPX_AVGSAD_16xHEIGHT_MSA(8);
1491 
1492 // 8x16
1493 VPX_SAD_8xHEIGHT_MSA(16);
1494 VPX_SAD_8xHEIGHTx3_MSA(16);
1495 VPX_SAD_8xHEIGHTx8_MSA(16);
1496 VPX_SAD_8xHEIGHTx4D_MSA(16);
1497 VPX_AVGSAD_8xHEIGHT_MSA(16);
1498 
1499 // 8x8
1500 VPX_SAD_8xHEIGHT_MSA(8);
1501 VPX_SAD_8xHEIGHTx3_MSA(8);
1502 VPX_SAD_8xHEIGHTx8_MSA(8);
1503 VPX_SAD_8xHEIGHTx4D_MSA(8);
1504 VPX_AVGSAD_8xHEIGHT_MSA(8);
1505 
1506 // 8x4
1507 VPX_SAD_8xHEIGHT_MSA(4);
1508 VPX_SAD_8xHEIGHTx3_MSA(4);
1509 VPX_SAD_8xHEIGHTx8_MSA(4);
1510 VPX_SAD_8xHEIGHTx4D_MSA(4);
1511 VPX_AVGSAD_8xHEIGHT_MSA(4);
1512 
1513 // 4x8
1514 VPX_SAD_4xHEIGHT_MSA(8);
1515 VPX_SAD_4xHEIGHTx3_MSA(8);
1516 VPX_SAD_4xHEIGHTx8_MSA(8);
1517 VPX_SAD_4xHEIGHTx4D_MSA(8);
1518 VPX_AVGSAD_4xHEIGHT_MSA(8);
1519 
1520 // 4x4
1521 VPX_SAD_4xHEIGHT_MSA(4);
1522 VPX_SAD_4xHEIGHTx3_MSA(4);
1523 VPX_SAD_4xHEIGHTx8_MSA(4);
1524 VPX_SAD_4xHEIGHTx4D_MSA(4);
1525 VPX_AVGSAD_4xHEIGHT_MSA(4);
1526
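
/* A hedged usage sketch of the generated entry points.  The function name
 * called below comes from the macro expansions above; the helper wrapping it
 * is illustrative only and not part of libvpx.
 */
#if 0
static uint32_t best_16x16_of_four(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *const refs[4],
                                   int32_t ref_stride) {
  uint32_t sads[4];
  uint32_t best;
  int32_t i;

  vpx_sad16x16x4d_msa(src, src_stride, refs, ref_stride, sads);

  best = sads[0];
  for (i = 1; i < 4; ++i) {
    if (sads[i] < best) best = sads[i];
  }

  return best;
}
#endif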