/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_util/loongson_intrinsics.h"

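/* Sum of absolute differences of two pairs of 16-byte vectors: the byte
 * differences are widened and accumulated pairwise into eight u16 lanes. */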
static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
                                 __m128i ref1) {
  __m128i diff0_m, diff1_m, sad_m0;
  __m128i sad_m = __lsx_vldi(0);

  diff0_m = __lsx_vabsd_bu(in0, ref0);
  diff1_m = __lsx_vabsd_bu(in1, ref1);

  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);
  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);

  return sad_m;
}

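/* Horizontal sum of the four u32 lanes of a vector. */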
static INLINE uint32_t hadd_uw_u32(__m128i in) {
  __m128i res0_m;
  uint32_t sum_m;

  res0_m = __lsx_vhaddw_du_wu(in, in);
  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

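/* Horizontal sum of the eight u16 lanes of a vector. */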
static INLINE uint32_t hadd_uh_u32(__m128i in) {
  __m128i res_m;
  uint32_t sum_m;

  res_m = __lsx_vhaddw_wu_hu(in, in);
  sum_m = hadd_uw_u32(res_m);

  return sum_m;
}

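/* Horizontal sum of the four i32 lanes of a vector. */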
static INLINE int32_t hadd_sw_s32(__m128i in) {
  __m128i res0_m;
  int32_t sum_m;

  res0_m = __lsx_vhaddw_d_w(in, in);
  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

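/* SAD for an 8-pixel-wide block: four rows are handled per iteration by
 * packing two 8-byte rows into each 128-bit vector. */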
static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t res;
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
    src += src_stride;
    ref += ref_stride;
    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
              src0, src1, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

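/* SAD for a 16-pixel-wide block, four rows per iteration. */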
static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  int32_t src_stride2 = src_stride << 1;
  int32_t ref_stride2 = ref_stride << 1;

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }

  res = hadd_uh_u32(sad);
  return res;
}

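/* SAD for a 32-pixel-wide block: two 16-byte loads per row, four rows per
 * iteration. */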
static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1;
  __m128i sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

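/* SAD for a 64-pixel-wide block: four 16-byte loads per row, two rows per
 * iteration, with the row sums split across two u16 accumulators so the
 * per-lane totals stay within 16 bits. */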
static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 1);
  uint32_t sad = 0;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }

  sad = hadd_uh_u32(sad0);
  sad += hadd_uh_u32(sad1);

  return sad;
}

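/* SADs of one 8-pixel-wide source block against four reference blocks,
 * four rows per iteration; the four results are written to sad_array. */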
static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 2);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src0, src1, src2, src3, sad_tmp;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src0 = __lsx_vld(src_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
              src2);
    src3 = __lsx_vldx(src_ptr, src_stride3);
    src_ptr += src_stride4;
    ref0 = __lsx_vld(ref0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
              ref2);
    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
    ref0_ptr += ref_stride4;
    ref4 = __lsx_vld(ref1_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
              ref6);
    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
    ref1_ptr += ref_stride4;
    ref8 = __lsx_vld(ref2_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
              ref10);
    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
    ref2_ptr += ref_stride4;
    ref12 = __lsx_vld(ref3_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
              ref14);
    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
    ref3_ptr += ref_stride4;

    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

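/* SADs of one 16-pixel-wide source block against four reference blocks,
 * two rows per iteration. */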
static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 1);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);

    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

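/* SADs of one 32-pixel-wide source block against four reference blocks,
 * one row per iteration. */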
static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;

    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

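/* SADs of one 64-pixel-wide source block against four reference blocks,
 * one row per iteration; each reference uses two u16 accumulators that are
 * widened to u32 before the final reduction. */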
static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad, sad_tmp;

  __m128i sad0_0 = __lsx_vldi(0);
  __m128i sad0_1 = sad0_0;
  __m128i sad1_0 = sad0_0;
  __m128i sad1_1 = sad0_0;
  __m128i sad2_0 = sad0_0;
  __m128i sad2_1 = sad0_0;
  __m128i sad3_0 = sad0_0;
  __m128i sad3_1 = sad0_0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;

    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[0] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[1] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[2] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[3] = hadd_uw_u32(sad);
}

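/* SAD of a 32-pixel-wide block against the rounding average of the
 * reference and a second predictor; sec_pred is laid out contiguously,
 * 32 bytes per row, and four rows are handled per iteration. */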
static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  __m128i comp0, comp1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  uint8_t *src_tmp, *ref_tmp;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  for (; ht_cnt--;) {
    src_tmp = (uint8_t *)src + 16;
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
    src6 = __lsx_vldx(src, src_stride3);
    src1 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
              src5);
    src7 = __lsx_vldx(src_tmp, src_stride3);
    src += src_stride4;

    ref_tmp = (uint8_t *)ref + 16;
    ref0 = __lsx_vld(ref, 0);
    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
    ref6 = __lsx_vldx(ref, ref_stride3);
    ref1 = __lsx_vld(ref_tmp, 0);
    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
              ref5);
    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
    ref += ref_stride4;

    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
              pred0, pred2, pred4, pred6);
    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
              112, pred1, pred3, pred5, pred7);
    sec_pred += 128;

    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

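/* SAD of a 64-pixel-wide block against the rounding average of the
 * reference and a second predictor; sec_pred is laid out contiguously,
 * 64 bytes per row, and four rows are handled per iteration. */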
static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
  __m128i sad, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
  sad = __lsx_vadd_w(sad, sad_tmp);

  res = hadd_sw_s32(sad);
  return res;
}

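/* The macros below generate the block-size-specific vpx_sad*_lsx,
 * vpx_sad*x4d_lsx and vpx_sad*_avg_lsx entry points declared via
 * vpx_dsp_rtcd.h. */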
#define VPX_SAD_8xHT_LSX(height)                                             \
  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHT_LSX(height)                                             \
  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHT_LSX(height)                                             \
  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHT_LSX(height)                                             \
  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHTx4D_LSX(height)                                       \
  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHTx4D_LSX(height)                                       \
  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHTx4D_LSX(height)                                       \
  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHTx4D_LSX(height)                                       \
  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_32xHT_LSX(height)                                    \
  uint32_t vpx_sad32x##height##_avg_lsx(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHT_LSX(height)                                    \
  uint32_t vpx_sad64x##height##_avg_lsx(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define SAD64                                                             \
  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
      VPX_AVGSAD_64xHT_LSX(64)

SAD64

#define SAD32                                                             \
  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
      VPX_AVGSAD_32xHT_LSX(32)

SAD32

#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)

SAD16

#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)

SAD8

#undef SAD64
#undef SAD32
#undef SAD16
#undef SAD8