1 /*!
2 * \copy
3 * Copyright (c) 2009-2018, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file satd_sad_lasx.c
33 *
34 * \brief Loongson optimization
35 *
36 * \date 12/10/2021 Created
37 *
38 *************************************************************************************
39 */
40
41 #include <stdint.h>
42 #include "loongson_intrinsics.h"
43
/*!
 * \brief Per-byte absolute difference of two vectors, reduced to 64-bit sums.
 *
 * Assigns to \a out0 the unsigned byte-wise absolute difference of \a in0 and
 * \a in1, then applies three widening horizontal adds
 * (byte -> halfword -> word -> doubleword), each with \a out0 as both operands.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * ';' forms exactly one statement and can be used safely as the body of an
 * unbraced if/else. \a in0 and \a in1 are read only by the first statement,
 * so \a out0 may name the same variable as either input. \a out0 is expanded
 * several times and must be a side-effect-free lvalue.
 */
#define HORISUM(in0, in1, out0)                        \
  do {                                                 \
    (out0) = __lasx_xvabsd_bu((in0), (in1));           \
    (out0) = __lasx_xvhaddw_hu_bu((out0), (out0));     \
    (out0) = __lasx_xvhaddw_wu_hu((out0), (out0));     \
    (out0) = __lasx_xvhaddw_du_wu((out0), (out0));     \
  } while (0)

WelsSampleSad4x4_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)50 int32_t WelsSampleSad4x4_lasx (uint8_t* pSample1, int32_t iStride1,
51 uint8_t* pSample2, int32_t iStride2) {
52 uint8_t *pSrc1 = pSample1;
53 uint8_t *pSrc2 = pSample2;
54 int32_t iStride0 = 0;
55 int32_t iStride1_tmp = iStride1 << 1;
56 int32_t iStride2_tmp = iStride2 << 1;
57
58 __m256i src1_0, src1_1, src1_2, src1_3;
59 __m256i src2_0, src2_1, src2_2, src2_3;
60
61 DUP4_ARG2(__lasx_xvldx,
62 pSrc1, iStride0,
63 pSrc1, iStride1,
64 pSrc1, iStride1_tmp,
65 pSrc1, iStride1_tmp + iStride1,
66 src1_0, src1_1, src1_2, src1_3);
67 DUP4_ARG2(__lasx_xvldx,
68 pSrc2, iStride0,
69 pSrc2, iStride2,
70 pSrc2, iStride2_tmp,
71 pSrc2, iStride2_tmp + iStride2,
72 src2_0, src2_1, src2_2, src2_3);
73
74 DUP2_ARG2(__lasx_xvpackev_w,
75 src1_0, src1_1, src1_2, src1_3,
76 src1_0, src1_2);
77 DUP2_ARG2(__lasx_xvpackev_w,
78 src2_0, src2_1, src2_2, src2_3,
79 src2_0, src2_2);
80 DUP2_ARG2(__lasx_xvpackev_d,
81 src1_0, src1_2, src2_0, src2_2,
82 src1_0, src2_0);
83
84 HORISUM(src1_0, src2_0, src1_0);
85
86 src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
87
88 return __lasx_xvpickve2gr_d(src1_0, 0);
89 }
90
91 static inline
WelsSampleSad8x8x2_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)92 int32_t WelsSampleSad8x8x2_lasx (uint8_t* pSample1, int32_t iStride1,
93 uint8_t* pSample2, int32_t iStride2) {
94 uint8_t* pSrc1 = pSample1;
95 uint8_t* pSrc2 = pSample2;
96 int32_t iStride0 = 0;
97 int32_t iStride1_tmp2 = iStride1 << 1;
98 int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1;
99 int32_t iStride1_tmp4 = iStride1 << 2;
100 int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1;
101 int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1;
102 int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1;
103 int32_t iStride2_tmp2 = iStride2 << 1;
104 int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2;
105 int32_t iStride2_tmp4 = iStride2 << 2;
106 int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2;
107 int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2;
108 int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2;
109
110 __m256i src1_0, src1_1, src1_2, src1_3,
111 src1_4, src1_5, src1_6, src1_7;
112 __m256i src2_0, src2_1, src2_2, src2_3,
113 src2_4, src2_5, src2_6, src2_7;
114
115 DUP4_ARG2(__lasx_xvldx,
116 pSrc1, iStride0,
117 pSrc1, iStride1,
118 pSrc1, iStride1_tmp2,
119 pSrc1, iStride1_tmp3,
120 src1_0, src1_1, src1_2, src1_3);
121 DUP4_ARG2(__lasx_xvldx,
122 pSrc1, iStride1_tmp4,
123 pSrc1, iStride1_tmp5,
124 pSrc1, iStride1_tmp6,
125 pSrc1, iStride1_tmp7,
126 src1_4, src1_5, src1_6, src1_7);
127 DUP4_ARG2(__lasx_xvldx,
128 pSrc2, iStride0,
129 pSrc2, iStride2,
130 pSrc2, iStride2_tmp2,
131 pSrc2, iStride2_tmp3,
132 src2_0, src2_1, src2_2, src2_3);
133 DUP4_ARG2(__lasx_xvldx,
134 pSrc2, iStride2_tmp4,
135 pSrc2, iStride2_tmp5,
136 pSrc2, iStride2_tmp6,
137 pSrc2, iStride2_tmp7,
138 src2_4, src2_5, src2_6, src2_7);
139
140 DUP4_ARG3(__lasx_xvpermi_q,
141 src1_0, src1_1, 0x20,
142 src1_2, src1_3, 0x20,
143 src1_4, src1_5, 0x20,
144 src1_6, src1_7, 0x20,
145 src1_0, src1_2, src1_4, src1_6);
146 DUP4_ARG3(__lasx_xvpermi_q,
147 src2_0, src2_1, 0x20,
148 src2_2, src2_3, 0x20,
149 src2_4, src2_5, 0x20,
150 src2_6, src2_7, 0x20,
151 src2_0, src2_2, src2_4, src2_6);
152
153 HORISUM(src1_0, src2_0, src1_0);
154 HORISUM(src1_2, src2_2, src1_2);
155 HORISUM(src1_4, src2_4, src1_4);
156 HORISUM(src1_6, src2_6, src1_6);
157
158 src1_0 = __lasx_xvadd_d(src1_0, src1_2);
159 src1_0 = __lasx_xvadd_d(src1_0, src1_4);
160 src1_0 = __lasx_xvadd_d(src1_0, src1_6);
161 src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
162
163 return (__lasx_xvpickve2gr_d(src1_0, 0) +
164 __lasx_xvpickve2gr_d(src1_0, 2));
165 }
166
WelsSampleSad8x8_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)167 int32_t WelsSampleSad8x8_lasx (uint8_t* pSample1, int32_t iStride1,
168 uint8_t* pSample2, int32_t iStride2) {
169 uint8_t* pSrc1 = pSample1;
170 uint8_t* pSrc2 = pSample2;
171 int32_t iStride0 = 0;
172 int32_t iStride1_tmp2 = iStride1 << 1;
173 int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1;
174 int32_t iStride1_tmp4 = iStride1 << 2;
175 int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1;
176 int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1;
177 int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1;
178 int32_t iStride2_tmp2 = iStride2 << 1;
179 int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2;
180 int32_t iStride2_tmp4 = iStride2 << 2;
181 int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2;
182 int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2;
183 int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2;
184
185 __m256i src1_0, src1_1, src1_2, src1_3,
186 src1_4, src1_5, src1_6, src1_7;
187 __m256i src2_0, src2_1, src2_2, src2_3,
188 src2_4, src2_5, src2_6, src2_7;
189
190 DUP4_ARG2(__lasx_xvldx,
191 pSrc1, iStride0,
192 pSrc1, iStride1,
193 pSrc1, iStride1_tmp2,
194 pSrc1, iStride1_tmp3,
195 src1_0, src1_1, src1_2, src1_3);
196 DUP4_ARG2(__lasx_xvldx,
197 pSrc1, iStride1_tmp4,
198 pSrc1, iStride1_tmp5,
199 pSrc1, iStride1_tmp6,
200 pSrc1, iStride1_tmp7,
201 src1_4, src1_5, src1_6, src1_7);
202 DUP4_ARG2(__lasx_xvldx,
203 pSrc2, iStride0,
204 pSrc2, iStride2,
205 pSrc2, iStride2_tmp2,
206 pSrc2, iStride2_tmp3,
207 src2_0, src2_1, src2_2, src2_3);
208 DUP4_ARG2(__lasx_xvldx,
209 pSrc2, iStride2_tmp4,
210 pSrc2, iStride2_tmp5,
211 pSrc2, iStride2_tmp6,
212 pSrc2, iStride2_tmp7,
213 src2_4, src2_5, src2_6, src2_7);
214
215 DUP4_ARG2(__lasx_xvpackev_d,
216 src1_0, src1_1, src1_2, src1_3,
217 src1_4, src1_5, src1_6, src1_7,
218 src1_0, src1_2, src1_4, src1_6);
219 DUP2_ARG3(__lasx_xvpermi_q,
220 src1_0, src1_2, 0x20,
221 src1_4, src1_6, 0x20,
222 src1_0, src1_4);
223 DUP4_ARG2(__lasx_xvpackev_d,
224 src2_0, src2_1, src2_2, src2_3,
225 src2_4, src2_5, src2_6, src2_7,
226 src2_0, src2_2, src2_4, src2_6);
227 DUP2_ARG3(__lasx_xvpermi_q,
228 src2_0, src2_2, 0x20,
229 src2_4, src2_6, 0x20,
230 src2_0, src2_4);
231
232 HORISUM(src1_0, src2_0, src1_0);
233 HORISUM(src1_4, src2_4, src1_4);
234
235 src1_0 = __lasx_xvadd_d(src1_0, src1_4);
236 src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
237
238 return (__lasx_xvpickve2gr_d(src1_0, 0) +
239 __lasx_xvpickve2gr_d(src1_0, 2));
240 }
241
WelsSampleSatd4x4_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)242 int32_t WelsSampleSatd4x4_lasx (uint8_t* pSample1, int32_t iStride1,
243 uint8_t* pSample2, int32_t iStride2) {
244 int32_t iSatdSum;
245 uint8_t* pSrc1 = pSample1;
246 uint8_t* pSrc2 = pSample2;
247 int32_t iStride0 = 0;
248 int32_t iStride1_tmp = iStride1 << 1;
249 int32_t iStride2_tmp = iStride2 << 1;
250
251 __m256i src1_0, src1_1, src1_2, src1_3;
252 __m256i src2_0, src2_1, src2_2, src2_3;
253 __m256i iSample01, iSample23;
254 __m256i tmp0, tmp1, tmp2, tmp3;
255 __m256i zero = __lasx_xvldi(0);
256 v16i16 mask= {1, 0, 3, 2, 5, 4, 7, 6, 1, 0, 3, 2, 5, 4, 7, 6};
257
258 DUP4_ARG2(__lasx_xvldx,
259 pSrc1, iStride0,
260 pSrc1, iStride1,
261 pSrc1, iStride1_tmp,
262 pSrc1, iStride1_tmp + iStride1,
263 src1_0, src1_1, src1_2, src1_3);
264 DUP4_ARG2(__lasx_xvldx,
265 pSrc2, iStride0,
266 pSrc2, iStride2,
267 pSrc2, iStride2_tmp,
268 pSrc2, iStride2_tmp + iStride2,
269 src2_0, src2_1, src2_2, src2_3);
270 DUP4_ARG2(__lasx_xvpackev_w,
271 src1_0, src1_1,
272 src1_2, src1_3,
273 src2_0, src2_1,
274 src2_2, src2_3,
275 src1_0, src1_2, src2_0, src2_2);
276 DUP2_ARG2(__lasx_xvpackev_d,
277 src1_0, src1_2,
278 src2_0, src2_2,
279 src1_0, src2_0);
280
281 tmp0 = __lasx_xvsubwev_h_bu(src1_0, src2_0);
282 tmp1 = __lasx_xvsubwod_h_bu(src1_0, src2_0);
283 tmp2 = __lasx_xvilvl_w(tmp0, tmp1);
284 tmp3 = __lasx_xvilvh_w(tmp0, tmp1);
285 tmp0 = __lasx_xvpermi_q(tmp3, tmp2, 0x20);
286 tmp0 = __lasx_xvshuf_h((__m256i)mask, tmp0, tmp0);
287
288 iSample01 = __lasx_xvhaddw_w_h(tmp0, tmp0);
289 iSample23 = __lasx_xvhsubw_w_h(tmp0, tmp0);
290 tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01);
291 tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23);
292 tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23);
293 tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01);
294
295 tmp1 = __lasx_xvpackev_w(tmp1, tmp0);
296 tmp3 = __lasx_xvpackev_w(tmp3, tmp2);
297 tmp0 = __lasx_xvpermi_q(tmp3, tmp1, 0x20);
298 tmp2 = __lasx_xvpermi_q(tmp3, tmp1, 0x31);
299 tmp0 = __lasx_xvpermi_w(tmp0, tmp0, 0x72);
300 tmp2 = __lasx_xvpermi_w(tmp2, tmp2, 0x72);
301
302 iSample01 = __lasx_xvadd_w(tmp0, tmp2);
303 iSample23 = __lasx_xvsub_w(tmp0, tmp2);
304
305 tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01);
306 tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23);
307 tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23);
308 tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01);
309
310 tmp0 = __lasx_xvpackev_w(tmp0, tmp1);
311 tmp2 = __lasx_xvpackev_w(tmp2, tmp3);
312
313 tmp0 = __lasx_xvabsd_w(tmp0, zero);
314 tmp2 = __lasx_xvabsd_w(tmp2, zero);
315 tmp0 = __lasx_xvadd_w(tmp0, tmp2);
316 tmp0 = __lasx_xvhaddw_d_w(tmp0, tmp0);
317 tmp0 = __lasx_xvhaddw_q_d(tmp0, tmp0);
318
319 iSatdSum = __lasx_xvpickve2gr_d(tmp0, 0) +
320 __lasx_xvpickve2gr_d(tmp0, 2);
321
322 return ((iSatdSum + 1) >> 1);
323 }
324
/*!
 * \brief SAD of a 16x8 block; forwards to WelsSampleSad8x8x2_lasx.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return the value returned by WelsSampleSad8x8x2_lasx for these arguments
 */
int32_t WelsSampleSad16x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  const int32_t iSad = WelsSampleSad8x8x2_lasx (pSample1, iStride1,
                                                pSample2, iStride2);

  return iSad;
}
331
/*!
 * \brief SAD of an 8x16 block, computed as two vertically stacked 8x8 SADs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the 8x8 SADs of the upper and the lower half
 */
int32_t WelsSampleSad8x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  /* The lower half starts eight rows below the upper half. */
  uint8_t* const pLower1 = pSample1 + 8 * iStride1;
  uint8_t* const pLower2 = pSample2 + 8 * iStride2;
  const int32_t iUpper = WelsSampleSad8x8_lasx (pSample1, iStride1,
                                                pSample2, iStride2);
  const int32_t iLower = WelsSampleSad8x8_lasx (pLower1, iStride1,
                                                pLower2, iStride2);

  return iUpper + iLower;
}
342
/*!
 * \brief SAD of a 16x16 block, computed as two vertically stacked 16x8 SADs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the WelsSampleSad8x8x2_lasx results of both halves
 */
int32_t WelsSampleSad16x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  /* The lower half starts eight rows below the upper half. */
  uint8_t* const pLower1 = pSample1 + 8 * iStride1;
  uint8_t* const pLower2 = pSample2 + 8 * iStride2;
  const int32_t iUpper = WelsSampleSad8x8x2_lasx (pSample1, iStride1,
                                                  pSample2, iStride2);
  const int32_t iLower = WelsSampleSad8x8x2_lasx (pLower1, iStride1,
                                                  pLower2, iStride2);

  return iUpper + iLower;
}
353
WelsSampleSadFour4x4_lasx(uint8_t * iSample1,int32_t iStride1,uint8_t * iSample2,int32_t iStride2,int32_t * pSad)354 void WelsSampleSadFour4x4_lasx (uint8_t* iSample1, int32_t iStride1,
355 uint8_t* iSample2, int32_t iStride2,
356 int32_t* pSad) {
357 uint8_t *pSrc1 = iSample1;
358 uint8_t *pSrc2 = iSample2 - iStride2;
359 uint8_t *pSrc3 = iSample2 + iStride2;
360 uint8_t *pSrc4 = iSample2 - 1;
361 uint8_t *pSrc5 = iSample2 + 1;
362 int32_t iStride0 = 0;
363 int32_t iStride1_tmp = iStride1 << 1;
364 int32_t iStride2_tmp = iStride2 << 1;
365
366 __m256i src1_0, src1_1, src1_2, src1_3;
367 __m256i src2_0, src2_1, src2_2, src2_3;
368 __m256i cb0, cb1, cb2, cb3, cb4, cb5, cb6, cb7;
369
370 DUP4_ARG2(__lasx_xvldx,
371 pSrc1, iStride0,
372 pSrc1, iStride1,
373 pSrc1, iStride1_tmp,
374 pSrc1, iStride1_tmp + iStride1,
375 src1_0, src1_1, src1_2, src1_3);
376 DUP4_ARG2(__lasx_xvldx,
377 pSrc2, iStride0,
378 pSrc2, iStride2,
379 pSrc2, iStride2_tmp,
380 pSrc2, iStride2_tmp + iStride2,
381 src2_0, src2_1, src2_2, src2_3);
382 DUP4_ARG2(__lasx_xvpackev_w,
383 src1_0, src1_1, src1_2, src1_3,
384 src2_0, src2_1, src2_2, src2_3,
385 src1_0, src1_2, src2_0, src2_2);
386 DUP2_ARG2(__lasx_xvpackev_d,
387 src1_0, src1_2, src2_0, src2_2,
388 cb0, cb1); //16 16
389 DUP4_ARG2(__lasx_xvldx,
390 pSrc1, iStride0,
391 pSrc1, iStride1,
392 pSrc1, iStride1_tmp,
393 pSrc1, iStride1_tmp + iStride1,
394 src1_0, src1_1, src1_2, src1_3);
395 DUP4_ARG2(__lasx_xvldx,
396 pSrc3, iStride0,
397 pSrc3, iStride2,
398 pSrc3, iStride2_tmp,
399 pSrc3, iStride2_tmp + iStride2,
400 src2_0, src2_1, src2_2, src2_3);
401 DUP4_ARG2(__lasx_xvpackev_w,
402 src1_0, src1_1, src1_2, src1_3,
403 src2_0, src2_1, src2_2, src2_3,
404 src1_0, src1_2, src2_0, src2_2);
405 DUP2_ARG2(__lasx_xvpackev_d,
406 src1_0, src1_2, src2_0, src2_2,
407 cb2, cb3); //16 16
408 DUP4_ARG2(__lasx_xvldx,
409 pSrc1, iStride0,
410 pSrc1, iStride1,
411 pSrc1, iStride1_tmp,
412 pSrc1, iStride1_tmp + iStride1,
413 src1_0, src1_1, src1_2, src1_3);
414 DUP4_ARG2(__lasx_xvldx,
415 pSrc4, iStride0,
416 pSrc4, iStride2,
417 pSrc4, iStride2_tmp,
418 pSrc4, iStride2_tmp + iStride2,
419 src2_0, src2_1, src2_2, src2_3);
420 DUP4_ARG2(__lasx_xvpackev_w,
421 src1_0, src1_1, src1_2, src1_3,
422 src2_0, src2_1, src2_2, src2_3,
423 src1_0, src1_2, src2_0, src2_2);
424 DUP2_ARG2(__lasx_xvpackev_d,
425 src1_0, src1_2, src2_0, src2_2,
426 cb4, cb5); //16 16
427 DUP4_ARG2(__lasx_xvldx,
428 pSrc1, iStride0,
429 pSrc1, iStride1,
430 pSrc1, iStride1_tmp,
431 pSrc1, iStride1_tmp + iStride1,
432 src1_0, src1_1, src1_2, src1_3);
433 DUP4_ARG2(__lasx_xvldx,
434 pSrc5, iStride0,
435 pSrc5, iStride2,
436 pSrc5, iStride2_tmp,
437 pSrc5, iStride2_tmp + iStride2,
438 src2_0, src2_1, src2_2, src2_3);
439 DUP4_ARG2(__lasx_xvpackev_w,
440 src1_0, src1_1, src1_2, src1_3,
441 src2_0, src2_1, src2_2, src2_3,
442 src1_0, src1_2, src2_0, src2_2);
443 DUP2_ARG2(__lasx_xvpackev_d,
444 src1_0, src1_2, src2_0, src2_2,
445 cb6, cb7); //16 16
446
447 cb0 = __lasx_xvpermi_q(cb2, cb0, 0x20);
448 cb1 = __lasx_xvpermi_q(cb3, cb1, 0x20);
449 cb4 = __lasx_xvpermi_q(cb6, cb4, 0x20);
450 cb5 = __lasx_xvpermi_q(cb7, cb5, 0x20);
451
452 HORISUM(cb0, cb1, cb0);
453 HORISUM(cb4, cb5, cb4);
454
455 DUP2_ARG2(__lasx_xvhaddw_qu_du,
456 cb0, cb0, cb4, cb4,
457 cb0, cb4);
458
459 * (pSad) = __lasx_xvpickve2gr_d(cb0, 0);
460 * (pSad + 1) = __lasx_xvpickve2gr_d(cb0, 2);
461 * (pSad + 2) = __lasx_xvpickve2gr_d(cb4, 0);
462 * (pSad + 3) = __lasx_xvpickve2gr_d(cb4, 2);
463 }
464
/*!
 * \brief 8x8 SADs of one block against the four neighbours of a reference.
 *
 * \param iSample1 address of the first row of the block being matched
 * \param iStride1 byte offset between consecutive rows of iSample1
 * \param iSample2 address of the first row of the centre reference block
 * \param iStride2 byte offset between consecutive rows of iSample2
 * \param pSad     receives four results: pSad[0] for iSample2 - iStride2,
 *                 pSad[1] for iSample2 + iStride2, pSad[2] for iSample2 - 1,
 *                 pSad[3] for iSample2 + 1
 */
void WelsSampleSadFour8x8_lasx (uint8_t* iSample1, int32_t iStride1,
                                uint8_t* iSample2, int32_t iStride2,
                                int32_t* pSad) {
  /* Candidate positions in output order: above, below, left, right. */
  uint8_t* const apCandidate[4] = {
    iSample2 - iStride2,
    iSample2 + iStride2,
    iSample2 - 1,
    iSample2 + 1
  };
  int32_t i;

  for (i = 0; i < 4; i++) {
    pSad[i] = WelsSampleSad8x8_lasx (iSample1, iStride1,
                                     apCandidate[i], iStride2);
  }
}
477
/*!
 * \brief 8x16 SADs of one block against the four neighbours of a reference.
 *
 * \param iSample1 address of the first row of the block being matched
 * \param iStride1 byte offset between consecutive rows of iSample1
 * \param iSample2 address of the first row of the centre reference block
 * \param iStride2 byte offset between consecutive rows of iSample2
 * \param pSad     receives four results: pSad[0] for iSample2 - iStride2,
 *                 pSad[1] for iSample2 + iStride2, pSad[2] for iSample2 - 1,
 *                 pSad[3] for iSample2 + 1
 */
void WelsSampleSadFour8x16_lasx (uint8_t* iSample1, int32_t iStride1,
                                 uint8_t* iSample2, int32_t iStride2,
                                 int32_t* pSad) {
  /* Candidate positions in output order: above, below, left, right. */
  uint8_t* const apCandidate[4] = {
    iSample2 - iStride2,
    iSample2 + iStride2,
    iSample2 - 1,
    iSample2 + 1
  };
  int32_t i;

  for (i = 0; i < 4; i++) {
    pSad[i] = WelsSampleSad8x16_lasx (iSample1, iStride1,
                                      apCandidate[i], iStride2);
  }
}
490
/*!
 * \brief 16x8 SADs of one block against the four neighbours of a reference.
 *
 * \param iSample1 address of the first row of the block being matched
 * \param iStride1 byte offset between consecutive rows of iSample1
 * \param iSample2 address of the first row of the centre reference block
 * \param iStride2 byte offset between consecutive rows of iSample2
 * \param pSad     receives four results: pSad[0] for iSample2 - iStride2,
 *                 pSad[1] for iSample2 + iStride2, pSad[2] for iSample2 - 1,
 *                 pSad[3] for iSample2 + 1
 */
void WelsSampleSadFour16x8_lasx (uint8_t* iSample1, int32_t iStride1,
                                 uint8_t* iSample2, int32_t iStride2,
                                 int32_t* pSad) {
  /* Candidate positions in output order: above, below, left, right. */
  uint8_t* const apCandidate[4] = {
    iSample2 - iStride2,
    iSample2 + iStride2,
    iSample2 - 1,
    iSample2 + 1
  };
  int32_t i;

  for (i = 0; i < 4; i++) {
    pSad[i] = WelsSampleSad16x8_lasx (iSample1, iStride1,
                                      apCandidate[i], iStride2);
  }
}
503
/*!
 * \brief 16x16 SADs of one block against the four neighbours of a reference.
 *
 * \param iSample1 address of the first row of the block being matched
 * \param iStride1 byte offset between consecutive rows of iSample1
 * \param iSample2 address of the first row of the centre reference block
 * \param iStride2 byte offset between consecutive rows of iSample2
 * \param pSad     receives four results: pSad[0] for iSample2 - iStride2,
 *                 pSad[1] for iSample2 + iStride2, pSad[2] for iSample2 - 1,
 *                 pSad[3] for iSample2 + 1
 */
void WelsSampleSadFour16x16_lasx (uint8_t* iSample1, int32_t iStride1,
                                  uint8_t* iSample2, int32_t iStride2,
                                  int32_t* pSad) {
  /* Candidate positions in output order: above, below, left, right. */
  uint8_t* const apCandidate[4] = {
    iSample2 - iStride2,
    iSample2 + iStride2,
    iSample2 - 1,
    iSample2 + 1
  };
  int32_t i;

  for (i = 0; i < 4; i++) {
    pSad[i] = WelsSampleSad16x16_lasx (iSample1, iStride1,
                                       apCandidate[i], iStride2);
  }
}
516
/*!
 * \brief SATD of an 8x8 block as the sum of its four 4x4 quadrant SATDs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the four WelsSampleSatd4x4_lasx results
 */
int32_t WelsSampleSatd8x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  /* The lower quadrants start four rows below the upper ones. */
  uint8_t* const pLower1 = pSample1 + 4 * iStride1;
  uint8_t* const pLower2 = pSample2 + 4 * iStride2;
  int32_t iTotal;

  /* Upper-left, upper-right, lower-left, lower-right. */
  iTotal  = WelsSampleSatd4x4_lasx (pSample1, iStride1, pSample2, iStride2);
  iTotal += WelsSampleSatd4x4_lasx (pSample1 + 4, iStride1,
                                    pSample2 + 4, iStride2);
  iTotal += WelsSampleSatd4x4_lasx (pLower1, iStride1, pLower2, iStride2);
  iTotal += WelsSampleSatd4x4_lasx (pLower1 + 4, iStride1,
                                    pLower2 + 4, iStride2);
  return iTotal;
}
531
/*!
 * \brief SATD of a 16x8 block as the sum of two side-by-side 8x8 SATDs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the WelsSampleSatd8x8_lasx results of the left and right half
 */
int32_t WelsSampleSatd16x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  const int32_t iLeft  = WelsSampleSatd8x8_lasx (pSample1, iStride1,
                                                 pSample2, iStride2);
  /* The right half starts eight bytes further along each row. */
  const int32_t iRight = WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1,
                                                 pSample2 + 8, iStride2);

  return iLeft + iRight;
}
542
/*!
 * \brief SATD of an 8x16 block as the sum of two vertically stacked 8x8 SATDs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the WelsSampleSatd8x8_lasx results of the upper and lower half
 */
int32_t WelsSampleSatd8x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  /* The lower half starts eight rows below the upper half. */
  uint8_t* const pLower1 = pSample1 + 8 * iStride1;
  uint8_t* const pLower2 = pSample2 + 8 * iStride2;
  const int32_t iUpper = WelsSampleSatd8x8_lasx (pSample1, iStride1,
                                                 pSample2, iStride2);
  const int32_t iLower = WelsSampleSatd8x8_lasx (pLower1, iStride1,
                                                 pLower2, iStride2);

  return iUpper + iLower;
}
553
/*!
 * \brief SATD of a 16x16 block as the sum of its four 8x8 quadrant SATDs.
 *
 * \param pSample1 address of the first row of the first block
 * \param iStride1 byte offset between consecutive rows of pSample1
 * \param pSample2 address of the first row of the second block
 * \param iStride2 byte offset between consecutive rows of pSample2
 * \return sum of the four WelsSampleSatd8x8_lasx results
 */
int32_t WelsSampleSatd16x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                  uint8_t* pSample2, int32_t iStride2) {
  /* The lower quadrants start eight rows below the upper ones. */
  uint8_t* const pLower1 = pSample1 + 8 * iStride1;
  uint8_t* const pLower2 = pSample2 + 8 * iStride2;
  int32_t iTotal;

  /* Upper-left, upper-right, lower-left, lower-right. */
  iTotal  = WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2);
  iTotal += WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1,
                                    pSample2 + 8, iStride2);
  iTotal += WelsSampleSatd8x8_lasx (pLower1, iStride1, pLower2, iStride2);
  iTotal += WelsSampleSatd8x8_lasx (pLower1 + 8, iStride1,
                                    pLower2 + 8, iStride2);
  return iTotal;
}
568