/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_util/loongson_intrinsics.h"

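/* Accumulates the absolute differences of two pairs of 16-byte vectors.
 * Adjacent byte differences are widened and summed pairwise, so the result
 * holds partial SADs in eight unsigned 16-bit lanes. */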
static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
                                 __m128i ref1) {
  __m128i diff0_m, diff1_m, sad_m0;
  __m128i sad_m = __lsx_vldi(0);

  diff0_m = __lsx_vabsd_bu(in0, ref0);
  diff1_m = __lsx_vabsd_bu(in1, ref1);

  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);
  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);

  return sad_m;
}

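/* Horizontal reduction: sums the four unsigned 32-bit lanes of 'in' into a
 * single 32-bit scalar. */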
static INLINE uint32_t hadd_uw_u32(__m128i in) {
  __m128i res0_m;
  uint32_t sum_m;

  res0_m = __lsx_vhaddw_du_wu(in, in);
  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

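/* Horizontal reduction: sums the eight unsigned 16-bit lanes of 'in' into a
 * single 32-bit scalar. */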
static INLINE uint32_t hadd_uh_u32(__m128i in) {
  __m128i res_m;
  uint32_t sum_m;

  res_m = __lsx_vhaddw_wu_hu(in, in);
  sum_m = hadd_uw_u32(res_m);

  return sum_m;
}

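/* Signed counterpart of hadd_uw_u32: sums the four signed 32-bit lanes of
 * 'in' into a single 32-bit scalar. */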
static INLINE int32_t hadd_sw_s32(__m128i in) {
  __m128i res0_m;
  int32_t sum_m;

  res0_m = __lsx_vhaddw_d_w(in, in);
  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

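/* SAD for 8-pixel-wide blocks. Four rows are processed per iteration; pairs
 * of 8-byte rows are packed into one 16-byte vector before accumulation. */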
static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t res;
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
    src += src_stride;
    ref += ref_stride;
    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
              src0, src1, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

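/* SAD for 16-pixel-wide blocks, accumulating four rows per loop iteration. */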
static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  int32_t src_stride2 = src_stride << 1;
  int32_t ref_stride2 = ref_stride << 1;

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }

  res = hadd_uh_u32(sad);
  return res;
}

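/* SAD for 32-pixel-wide blocks: two 16-byte loads per row, four rows per
 * loop iteration. */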
static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1;
  __m128i sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

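/* SAD for 64-pixel-wide blocks: four 16-byte loads per row, two rows per
 * loop iteration, accumulated in two vectors that are reduced separately. */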
static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 1);
  uint32_t sad = 0;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }

  sad = hadd_uh_u32(sad0);
  sad += hadd_uh_u32(sad1);

  return sad;
}

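/* 8-wide SAD against four reference blocks at once (x4d). Four source rows
 * are packed two-per-vector and compared with the matching rows of each
 * reference; one 16-bit accumulator is kept per reference. */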
static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 2);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src0, src1, src2, src3, sad_tmp;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src0 = __lsx_vld(src_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
              src2);
    src3 = __lsx_vldx(src_ptr, src_stride3);
    src_ptr += src_stride4;
    ref0 = __lsx_vld(ref0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
              ref2);
    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
    ref0_ptr += ref_stride4;
    ref4 = __lsx_vld(ref1_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
              ref6);
    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
    ref1_ptr += ref_stride4;
    ref8 = __lsx_vld(ref2_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
              ref10);
    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
    ref2_ptr += ref_stride4;
    ref12 = __lsx_vld(ref3_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
              ref14);
    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
    ref3_ptr += ref_stride4;

    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

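/* 16-wide x4d SAD: each source row is compared against the corresponding
 * row of all four references, two rows per loop iteration. */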
static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 1);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);

    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

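/* 32-wide x4d SAD: one row (two 16-byte loads) per loop iteration for the
 * source and for each of the four references. */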
static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;

    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

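/* 64-wide x4d SAD: four 16-byte loads per row. Each reference keeps two
 * 16-bit accumulators, which are widened to 32-bit lanes before the final
 * reduction. */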
static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad, sad_tmp;

  __m128i sad0_0 = __lsx_vldi(0);
  __m128i sad0_1 = sad0_0;
  __m128i sad1_0 = sad0_0;
  __m128i sad1_1 = sad0_0;
  __m128i sad2_0 = sad0_0;
  __m128i sad2_1 = sad0_0;
  __m128i sad3_0 = sad0_0;
  __m128i sad3_1 = sad0_0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;

    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[0] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[1] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[2] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[3] = hadd_uw_u32(sad);
}

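/* SAD of the source against the rounded average of the reference and a
 * second predictor (sec_pred), for 32-pixel-wide blocks; four rows are
 * processed per loop iteration. */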
static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  __m128i comp0, comp1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  uint8_t *src_tmp, *ref_tmp;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  for (; ht_cnt--;) {
    src_tmp = (uint8_t *)src + 16;
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
    src6 = __lsx_vldx(src, src_stride3);
    src1 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
              src5);
    src7 = __lsx_vldx(src_tmp, src_stride3);
    src += src_stride4;

    ref_tmp = (uint8_t *)ref + 16;
    ref0 = __lsx_vld(ref, 0);
    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
    ref6 = __lsx_vldx(ref, ref_stride3);
    ref1 = __lsx_vld(ref_tmp, 0);
    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
              ref5);
    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
    ref += ref_stride4;

    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
              pred0, pred2, pred4, pred6);
    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
              112, pred1, pred3, pred5, pred7);
    sec_pred += 128;

    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

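/* Averaged SAD for 64-pixel-wide blocks: four rows per loop iteration; the
 * two 16-bit accumulators are widened to 32-bit lanes before the final
 * reduction. */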
static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
  __m128i sad, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
  sad = __lsx_vadd_w(sad, sad_tmp);

  res = hadd_sw_s32(sad);
  return res;
}

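/* The macros below instantiate the exported vpx_sad*_lsx, vpx_sad*x4d_lsx
 * and vpx_sad*_avg_lsx entry points (dispatched via vpx_dsp_rtcd.h) for the
 * block sizes implemented above. */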
#define VPX_SAD_8xHT_LSX(height)                                              \
  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,    \
                                   const uint8_t *ref, int32_t ref_stride) {  \
    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);          \
  }

#define VPX_SAD_16xHT_LSX(height)                                             \
  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHT_LSX(height)                                             \
  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHT_LSX(height)                                             \
  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHTx4D_LSX(height)                                           \
  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,     \
                                  const uint8_t *const refs[4],               \
                                  int32_t ref_stride, uint32_t sads[4]) {     \
    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);      \
  }

#define VPX_SAD_16xHTx4D_LSX(height)                                          \
  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,    \
                                   const uint8_t *const refs[],               \
                                   int32_t ref_stride, uint32_t *sads) {      \
    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);     \
  }

#define VPX_SAD_32xHTx4D_LSX(height)                                          \
  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,    \
                                   const uint8_t *const refs[],               \
                                   int32_t ref_stride, uint32_t *sads) {      \
    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);     \
  }

#define VPX_SAD_64xHTx4D_LSX(height)                                          \
  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,    \
                                   const uint8_t *const refs[],               \
                                   int32_t ref_stride, uint32_t *sads) {      \
    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);     \
  }

#define VPX_AVGSAD_32xHT_LSX(height)                                          \
  uint32_t vpx_sad32x##height##_avg_lsx(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,             \
      int32_t ref_stride, const uint8_t *second_pred) {                       \
    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height,       \
                              second_pred);                                   \
  }

#define VPX_AVGSAD_64xHT_LSX(height)                                          \
  uint32_t vpx_sad64x##height##_avg_lsx(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,             \
      int32_t ref_stride, const uint8_t *second_pred) {                       \
    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height,       \
                              second_pred);                                   \
  }

#define SAD64                                                                 \
  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32)     \
  VPX_AVGSAD_64xHT_LSX(64)

SAD64

#define SAD32                                                                 \
  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64)     \
  VPX_AVGSAD_32xHT_LSX(32)

SAD32

#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)

SAD16

#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)

SAD8

#undef SAD64
#undef SAD32
#undef SAD16
#undef SAD8