/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    satd_sad_lasx.c
 *
 * \brief   Loongson optimization
 *
 * \date    12/10/2021 Created
 *
 *************************************************************************************
 */
40 
#include <stdint.h>
#include "loongson_intrinsics.h"
43 
/*
 * HORISUM(in0, in1, out0)
 *
 * Stores in out0 the unsigned per-byte absolute difference of in0 and in1,
 * then folds it with three widening horizontal adds
 * (byte -> halfword -> word -> doubleword), so each 64-bit element of out0
 * ends up holding the sum of the eight byte differences it covered.
 *
 * out0 may name the same variable as in0 or in1: both inputs are consumed by
 * the first statement before out0 is overwritten.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define HORISUM(in0, in1, out0)                      \
  do {                                               \
    (out0) = __lasx_xvabsd_bu((in0), (in1));         \
    (out0) = __lasx_xvhaddw_hu_bu((out0), (out0));   \
    (out0) = __lasx_xvhaddw_wu_hu((out0), (out0));   \
    (out0) = __lasx_xvhaddw_du_wu((out0), (out0));   \
  } while (0)
49 
WelsSampleSad4x4_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)50 int32_t WelsSampleSad4x4_lasx (uint8_t* pSample1, int32_t iStride1,
51                                uint8_t* pSample2, int32_t iStride2) {
52   uint8_t *pSrc1 = pSample1;
53   uint8_t *pSrc2 = pSample2;
54   int32_t iStride0 = 0;
55   int32_t iStride1_tmp = iStride1 << 1;
56   int32_t iStride2_tmp = iStride2 << 1;
57 
58   __m256i src1_0, src1_1, src1_2, src1_3;
59   __m256i src2_0, src2_1, src2_2, src2_3;
60 
61   DUP4_ARG2(__lasx_xvldx,
62             pSrc1, iStride0,
63             pSrc1, iStride1,
64             pSrc1, iStride1_tmp,
65             pSrc1, iStride1_tmp + iStride1,
66             src1_0, src1_1, src1_2, src1_3);
67   DUP4_ARG2(__lasx_xvldx,
68             pSrc2, iStride0,
69             pSrc2, iStride2,
70             pSrc2, iStride2_tmp,
71             pSrc2, iStride2_tmp + iStride2,
72             src2_0, src2_1, src2_2, src2_3);
73 
74   DUP2_ARG2(__lasx_xvpackev_w,
75             src1_0, src1_1, src1_2, src1_3,
76             src1_0, src1_2);
77   DUP2_ARG2(__lasx_xvpackev_w,
78             src2_0, src2_1, src2_2, src2_3,
79             src2_0, src2_2);
80   DUP2_ARG2(__lasx_xvpackev_d,
81             src1_0, src1_2, src2_0, src2_2,
82             src1_0, src2_0);
83 
84   HORISUM(src1_0, src2_0, src1_0);
85 
86   src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
87 
88   return __lasx_xvpickve2gr_d(src1_0, 0);
89 }
90 
91 static inline
WelsSampleSad8x8x2_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)92 int32_t WelsSampleSad8x8x2_lasx (uint8_t* pSample1, int32_t iStride1,
93                                  uint8_t* pSample2, int32_t iStride2) {
94   uint8_t* pSrc1 = pSample1;
95   uint8_t* pSrc2 = pSample2;
96   int32_t iStride0 = 0;
97   int32_t iStride1_tmp2 = iStride1 << 1;
98   int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1;
99   int32_t iStride1_tmp4 = iStride1 << 2;
100   int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1;
101   int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1;
102   int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1;
103   int32_t iStride2_tmp2 = iStride2 << 1;
104   int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2;
105   int32_t iStride2_tmp4 = iStride2 << 2;
106   int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2;
107   int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2;
108   int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2;
109 
110   __m256i src1_0, src1_1, src1_2, src1_3,
111           src1_4, src1_5, src1_6, src1_7;
112   __m256i src2_0, src2_1, src2_2, src2_3,
113           src2_4, src2_5, src2_6, src2_7;
114 
115   DUP4_ARG2(__lasx_xvldx,
116             pSrc1, iStride0,
117             pSrc1, iStride1,
118             pSrc1, iStride1_tmp2,
119             pSrc1, iStride1_tmp3,
120             src1_0, src1_1, src1_2, src1_3);
121   DUP4_ARG2(__lasx_xvldx,
122             pSrc1, iStride1_tmp4,
123             pSrc1, iStride1_tmp5,
124             pSrc1, iStride1_tmp6,
125             pSrc1, iStride1_tmp7,
126             src1_4, src1_5, src1_6, src1_7);
127   DUP4_ARG2(__lasx_xvldx,
128             pSrc2, iStride0,
129             pSrc2, iStride2,
130             pSrc2, iStride2_tmp2,
131             pSrc2, iStride2_tmp3,
132             src2_0, src2_1, src2_2, src2_3);
133   DUP4_ARG2(__lasx_xvldx,
134             pSrc2, iStride2_tmp4,
135             pSrc2, iStride2_tmp5,
136             pSrc2, iStride2_tmp6,
137             pSrc2, iStride2_tmp7,
138             src2_4, src2_5, src2_6, src2_7);
139 
140   DUP4_ARG3(__lasx_xvpermi_q,
141             src1_0, src1_1, 0x20,
142             src1_2, src1_3, 0x20,
143             src1_4, src1_5, 0x20,
144             src1_6, src1_7, 0x20,
145             src1_0, src1_2, src1_4, src1_6);
146   DUP4_ARG3(__lasx_xvpermi_q,
147             src2_0, src2_1, 0x20,
148             src2_2, src2_3, 0x20,
149             src2_4, src2_5, 0x20,
150             src2_6, src2_7, 0x20,
151             src2_0, src2_2, src2_4, src2_6);
152 
153   HORISUM(src1_0, src2_0, src1_0);
154   HORISUM(src1_2, src2_2, src1_2);
155   HORISUM(src1_4, src2_4, src1_4);
156   HORISUM(src1_6, src2_6, src1_6);
157 
158   src1_0 = __lasx_xvadd_d(src1_0, src1_2);
159   src1_0 = __lasx_xvadd_d(src1_0, src1_4);
160   src1_0 = __lasx_xvadd_d(src1_0, src1_6);
161   src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
162 
163   return (__lasx_xvpickve2gr_d(src1_0, 0) +
164           __lasx_xvpickve2gr_d(src1_0, 2));
165 }
166 
WelsSampleSad8x8_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)167 int32_t WelsSampleSad8x8_lasx (uint8_t* pSample1, int32_t iStride1,
168                                uint8_t* pSample2, int32_t iStride2) {
169   uint8_t* pSrc1 = pSample1;
170   uint8_t* pSrc2 = pSample2;
171   int32_t iStride0 = 0;
172   int32_t iStride1_tmp2 = iStride1 << 1;
173   int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1;
174   int32_t iStride1_tmp4 = iStride1 << 2;
175   int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1;
176   int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1;
177   int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1;
178   int32_t iStride2_tmp2 = iStride2 << 1;
179   int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2;
180   int32_t iStride2_tmp4 = iStride2 << 2;
181   int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2;
182   int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2;
183   int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2;
184 
185   __m256i src1_0, src1_1, src1_2, src1_3,
186           src1_4, src1_5, src1_6, src1_7;
187   __m256i src2_0, src2_1, src2_2, src2_3,
188           src2_4, src2_5, src2_6, src2_7;
189 
190   DUP4_ARG2(__lasx_xvldx,
191             pSrc1, iStride0,
192             pSrc1, iStride1,
193             pSrc1, iStride1_tmp2,
194             pSrc1, iStride1_tmp3,
195             src1_0, src1_1, src1_2, src1_3);
196   DUP4_ARG2(__lasx_xvldx,
197             pSrc1, iStride1_tmp4,
198             pSrc1, iStride1_tmp5,
199             pSrc1, iStride1_tmp6,
200             pSrc1, iStride1_tmp7,
201             src1_4, src1_5, src1_6, src1_7);
202   DUP4_ARG2(__lasx_xvldx,
203             pSrc2, iStride0,
204             pSrc2, iStride2,
205             pSrc2, iStride2_tmp2,
206             pSrc2, iStride2_tmp3,
207             src2_0, src2_1, src2_2, src2_3);
208   DUP4_ARG2(__lasx_xvldx,
209             pSrc2, iStride2_tmp4,
210             pSrc2, iStride2_tmp5,
211             pSrc2, iStride2_tmp6,
212             pSrc2, iStride2_tmp7,
213             src2_4, src2_5, src2_6, src2_7);
214 
215   DUP4_ARG2(__lasx_xvpackev_d,
216             src1_0, src1_1, src1_2, src1_3,
217             src1_4, src1_5, src1_6, src1_7,
218             src1_0, src1_2, src1_4, src1_6);
219   DUP2_ARG3(__lasx_xvpermi_q,
220             src1_0, src1_2, 0x20,
221             src1_4, src1_6, 0x20,
222             src1_0, src1_4);
223   DUP4_ARG2(__lasx_xvpackev_d,
224             src2_0, src2_1, src2_2, src2_3,
225             src2_4, src2_5, src2_6, src2_7,
226             src2_0, src2_2, src2_4, src2_6);
227   DUP2_ARG3(__lasx_xvpermi_q,
228             src2_0, src2_2, 0x20,
229             src2_4, src2_6, 0x20,
230             src2_0, src2_4);
231 
232   HORISUM(src1_0, src2_0, src1_0);
233   HORISUM(src1_4, src2_4, src1_4);
234 
235   src1_0 = __lasx_xvadd_d(src1_0, src1_4);
236   src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0);
237 
238   return (__lasx_xvpickve2gr_d(src1_0, 0) +
239           __lasx_xvpickve2gr_d(src1_0, 2));
240 }
241 
WelsSampleSatd4x4_lasx(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)242 int32_t WelsSampleSatd4x4_lasx (uint8_t* pSample1, int32_t iStride1,
243                                 uint8_t* pSample2, int32_t iStride2) {
244   int32_t iSatdSum;
245   uint8_t* pSrc1 = pSample1;
246   uint8_t* pSrc2 = pSample2;
247   int32_t iStride0 = 0;
248   int32_t iStride1_tmp = iStride1 << 1;
249   int32_t iStride2_tmp = iStride2 << 1;
250 
251   __m256i src1_0, src1_1, src1_2, src1_3;
252   __m256i src2_0, src2_1, src2_2, src2_3;
253   __m256i iSample01, iSample23;
254   __m256i tmp0, tmp1, tmp2, tmp3;
255   __m256i zero = __lasx_xvldi(0);
256   v16i16 mask= {1, 0, 3, 2, 5, 4, 7, 6, 1, 0, 3, 2, 5, 4, 7, 6};
257 
258   DUP4_ARG2(__lasx_xvldx,
259             pSrc1, iStride0,
260             pSrc1, iStride1,
261             pSrc1, iStride1_tmp,
262             pSrc1, iStride1_tmp + iStride1,
263             src1_0, src1_1, src1_2, src1_3);
264   DUP4_ARG2(__lasx_xvldx,
265             pSrc2, iStride0,
266             pSrc2, iStride2,
267             pSrc2, iStride2_tmp,
268             pSrc2, iStride2_tmp + iStride2,
269             src2_0, src2_1, src2_2, src2_3);
270   DUP4_ARG2(__lasx_xvpackev_w,
271             src1_0, src1_1,
272             src1_2, src1_3,
273             src2_0, src2_1,
274             src2_2, src2_3,
275             src1_0, src1_2, src2_0, src2_2);
276   DUP2_ARG2(__lasx_xvpackev_d,
277             src1_0, src1_2,
278             src2_0, src2_2,
279             src1_0, src2_0);
280 
281   tmp0 = __lasx_xvsubwev_h_bu(src1_0, src2_0);
282   tmp1 = __lasx_xvsubwod_h_bu(src1_0, src2_0);
283   tmp2 = __lasx_xvilvl_w(tmp0, tmp1);
284   tmp3 = __lasx_xvilvh_w(tmp0, tmp1);
285   tmp0 = __lasx_xvpermi_q(tmp3, tmp2, 0x20);
286   tmp0 = __lasx_xvshuf_h((__m256i)mask, tmp0, tmp0);
287 
288   iSample01 = __lasx_xvhaddw_w_h(tmp0, tmp0);
289   iSample23 = __lasx_xvhsubw_w_h(tmp0, tmp0);
290   tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01);
291   tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23);
292   tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23);
293   tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01);
294 
295   tmp1 = __lasx_xvpackev_w(tmp1, tmp0);
296   tmp3 = __lasx_xvpackev_w(tmp3, tmp2);
297   tmp0 = __lasx_xvpermi_q(tmp3, tmp1, 0x20);
298   tmp2 = __lasx_xvpermi_q(tmp3, tmp1, 0x31);
299   tmp0 = __lasx_xvpermi_w(tmp0, tmp0, 0x72);
300   tmp2 = __lasx_xvpermi_w(tmp2, tmp2, 0x72);
301 
302   iSample01 = __lasx_xvadd_w(tmp0, tmp2);
303   iSample23 = __lasx_xvsub_w(tmp0, tmp2);
304 
305   tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01);
306   tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23);
307   tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23);
308   tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01);
309 
310   tmp0 = __lasx_xvpackev_w(tmp0, tmp1);
311   tmp2 = __lasx_xvpackev_w(tmp2, tmp3);
312 
313   tmp0 = __lasx_xvabsd_w(tmp0, zero);
314   tmp2 = __lasx_xvabsd_w(tmp2, zero);
315   tmp0 = __lasx_xvadd_w(tmp0, tmp2);
316   tmp0 = __lasx_xvhaddw_d_w(tmp0, tmp0);
317   tmp0 = __lasx_xvhaddw_q_d(tmp0, tmp0);
318 
319   iSatdSum = __lasx_xvpickve2gr_d(tmp0, 0) +
320              __lasx_xvpickve2gr_d(tmp0, 2);
321 
322   return ((iSatdSum + 1) >> 1);
323 }
324 
/*!
 * \brief   SAD entry point for the 16x8 block size; forwards unchanged to
 *          WelsSampleSad8x8x2_lasx.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  the value returned by WelsSampleSad8x8x2_lasx
 */
int32_t WelsSampleSad16x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  return WelsSampleSad8x8x2_lasx (pSample1, iStride1, pSample2, iStride2);
}
331 
/*!
 * \brief   SAD of an 8x16 block, computed as the sum of two
 *          WelsSampleSad8x8_lasx calls: rows 0-7 and rows 8-15.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the two 8x8 results
 */
int32_t WelsSampleSad8x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  /* Offset of row 8; multiplied rather than shifted because left-shifting a
   * negative stride is undefined behaviour in C. */
  uint8_t* pLower1 = pSample1 + iStride1 * 8;
  uint8_t* pLower2 = pSample2 + iStride2 * 8;

  return WelsSampleSad8x8_lasx (pSample1, iStride1, pSample2, iStride2) +
         WelsSampleSad8x8_lasx (pLower1, iStride1, pLower2, iStride2);
}
342 
/*!
 * \brief   SAD of a 16x16 block, computed as the sum of two
 *          WelsSampleSad8x8x2_lasx calls: rows 0-7 and rows 8-15.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the two half-block results
 */
int32_t WelsSampleSad16x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  /* Offset of row 8; multiplied rather than shifted because left-shifting a
   * negative stride is undefined behaviour in C. */
  uint8_t* pLower1 = pSample1 + iStride1 * 8;
  uint8_t* pLower2 = pSample2 + iStride2 * 8;

  return WelsSampleSad8x8x2_lasx (pSample1, iStride1, pSample2, iStride2) +
         WelsSampleSad8x8x2_lasx (pLower1, iStride1, pLower2, iStride2);
}
353 
WelsSampleSadFour4x4_lasx(uint8_t * iSample1,int32_t iStride1,uint8_t * iSample2,int32_t iStride2,int32_t * pSad)354 void WelsSampleSadFour4x4_lasx (uint8_t* iSample1, int32_t iStride1,
355                                 uint8_t* iSample2, int32_t iStride2,
356                                 int32_t* pSad) {
357   uint8_t *pSrc1 = iSample1;
358   uint8_t *pSrc2 = iSample2 - iStride2;
359   uint8_t *pSrc3 = iSample2 + iStride2;
360   uint8_t *pSrc4 = iSample2 - 1;
361   uint8_t *pSrc5 = iSample2 + 1;
362   int32_t iStride0 = 0;
363   int32_t iStride1_tmp = iStride1 << 1;
364   int32_t iStride2_tmp = iStride2 << 1;
365 
366   __m256i src1_0, src1_1, src1_2, src1_3;
367   __m256i src2_0, src2_1, src2_2, src2_3;
368   __m256i cb0, cb1, cb2, cb3, cb4, cb5, cb6, cb7;
369 
370   DUP4_ARG2(__lasx_xvldx,
371             pSrc1, iStride0,
372             pSrc1, iStride1,
373             pSrc1, iStride1_tmp,
374             pSrc1, iStride1_tmp + iStride1,
375             src1_0, src1_1, src1_2, src1_3);
376   DUP4_ARG2(__lasx_xvldx,
377             pSrc2, iStride0,
378             pSrc2, iStride2,
379             pSrc2, iStride2_tmp,
380             pSrc2, iStride2_tmp + iStride2,
381             src2_0, src2_1, src2_2, src2_3);
382   DUP4_ARG2(__lasx_xvpackev_w,
383             src1_0, src1_1, src1_2, src1_3,
384             src2_0, src2_1, src2_2, src2_3,
385             src1_0, src1_2, src2_0, src2_2);
386   DUP2_ARG2(__lasx_xvpackev_d,
387             src1_0, src1_2, src2_0, src2_2,
388             cb0, cb1); //16 16
389   DUP4_ARG2(__lasx_xvldx,
390             pSrc1, iStride0,
391             pSrc1, iStride1,
392             pSrc1, iStride1_tmp,
393             pSrc1, iStride1_tmp + iStride1,
394             src1_0, src1_1, src1_2, src1_3);
395   DUP4_ARG2(__lasx_xvldx,
396             pSrc3, iStride0,
397             pSrc3, iStride2,
398             pSrc3, iStride2_tmp,
399             pSrc3, iStride2_tmp + iStride2,
400             src2_0, src2_1, src2_2, src2_3);
401   DUP4_ARG2(__lasx_xvpackev_w,
402             src1_0, src1_1, src1_2, src1_3,
403             src2_0, src2_1, src2_2, src2_3,
404             src1_0, src1_2, src2_0, src2_2);
405   DUP2_ARG2(__lasx_xvpackev_d,
406             src1_0, src1_2, src2_0, src2_2,
407             cb2, cb3); //16 16
408   DUP4_ARG2(__lasx_xvldx,
409             pSrc1, iStride0,
410             pSrc1, iStride1,
411             pSrc1, iStride1_tmp,
412             pSrc1, iStride1_tmp + iStride1,
413             src1_0, src1_1, src1_2, src1_3);
414   DUP4_ARG2(__lasx_xvldx,
415             pSrc4, iStride0,
416             pSrc4, iStride2,
417             pSrc4, iStride2_tmp,
418             pSrc4, iStride2_tmp + iStride2,
419             src2_0, src2_1, src2_2, src2_3);
420   DUP4_ARG2(__lasx_xvpackev_w,
421             src1_0, src1_1, src1_2, src1_3,
422             src2_0, src2_1, src2_2, src2_3,
423             src1_0, src1_2, src2_0, src2_2);
424   DUP2_ARG2(__lasx_xvpackev_d,
425             src1_0, src1_2, src2_0, src2_2,
426             cb4, cb5); //16 16
427   DUP4_ARG2(__lasx_xvldx,
428             pSrc1, iStride0,
429             pSrc1, iStride1,
430             pSrc1, iStride1_tmp,
431             pSrc1, iStride1_tmp + iStride1,
432             src1_0, src1_1, src1_2, src1_3);
433   DUP4_ARG2(__lasx_xvldx,
434             pSrc5, iStride0,
435             pSrc5, iStride2,
436             pSrc5, iStride2_tmp,
437             pSrc5, iStride2_tmp + iStride2,
438             src2_0, src2_1, src2_2, src2_3);
439   DUP4_ARG2(__lasx_xvpackev_w,
440             src1_0, src1_1, src1_2, src1_3,
441             src2_0, src2_1, src2_2, src2_3,
442             src1_0, src1_2, src2_0, src2_2);
443   DUP2_ARG2(__lasx_xvpackev_d,
444             src1_0, src1_2, src2_0, src2_2,
445             cb6, cb7); //16 16
446 
447   cb0 = __lasx_xvpermi_q(cb2, cb0, 0x20);
448   cb1 = __lasx_xvpermi_q(cb3, cb1, 0x20);
449   cb4 = __lasx_xvpermi_q(cb6, cb4, 0x20);
450   cb5 = __lasx_xvpermi_q(cb7, cb5, 0x20);
451 
452   HORISUM(cb0, cb1, cb0);
453   HORISUM(cb4, cb5, cb4);
454 
455   DUP2_ARG2(__lasx_xvhaddw_qu_du,
456            cb0, cb0, cb4, cb4,
457            cb0, cb4);
458 
459   * (pSad) = __lasx_xvpickve2gr_d(cb0, 0);
460   * (pSad + 1) = __lasx_xvpickve2gr_d(cb0, 2);
461   * (pSad + 2) = __lasx_xvpickve2gr_d(cb4, 0);
462   * (pSad + 3) = __lasx_xvpickve2gr_d(cb4, 2);
463 }
464 
/*!
 * \brief   Four 8x8 SADs of one block against the four positions adjacent
 *          to a reference position, each computed by WelsSampleSad8x8_lasx.
 *
 * \param   iSample1  first byte of the block being compared
 * \param   iStride1  byte offset between consecutive rows of iSample1
 * \param   iSample2  first byte of the centre reference position
 * \param   iStride2  byte offset between consecutive rows of iSample2
 * \param   pSad      output array of four values:
 *                    pSad[0] - reference at iSample2 - iStride2 (row above),
 *                    pSad[1] - reference at iSample2 + iStride2 (row below),
 *                    pSad[2] - reference at iSample2 - 1 (one byte left),
 *                    pSad[3] - reference at iSample2 + 1 (one byte right)
 */
void WelsSampleSadFour8x8_lasx (uint8_t* iSample1, int32_t iStride1,
                                uint8_t* iSample2, int32_t iStride2,
                                int32_t* pSad) {
  pSad[0] = WelsSampleSad8x8_lasx (iSample1, iStride1,
                                   iSample2 - iStride2, iStride2);
  pSad[1] = WelsSampleSad8x8_lasx (iSample1, iStride1,
                                   iSample2 + iStride2, iStride2);
  pSad[2] = WelsSampleSad8x8_lasx (iSample1, iStride1,
                                   iSample2 - 1, iStride2);
  pSad[3] = WelsSampleSad8x8_lasx (iSample1, iStride1,
                                   iSample2 + 1, iStride2);
}
477 
/*!
 * \brief   Four 8x16 SADs of one block against the four positions adjacent
 *          to a reference position, each computed by WelsSampleSad8x16_lasx.
 *
 * \param   iSample1  first byte of the block being compared
 * \param   iStride1  byte offset between consecutive rows of iSample1
 * \param   iSample2  first byte of the centre reference position
 * \param   iStride2  byte offset between consecutive rows of iSample2
 * \param   pSad      output array of four values:
 *                    pSad[0] - reference at iSample2 - iStride2 (row above),
 *                    pSad[1] - reference at iSample2 + iStride2 (row below),
 *                    pSad[2] - reference at iSample2 - 1 (one byte left),
 *                    pSad[3] - reference at iSample2 + 1 (one byte right)
 */
void WelsSampleSadFour8x16_lasx (uint8_t* iSample1, int32_t iStride1,
                                 uint8_t* iSample2, int32_t iStride2,
                                 int32_t* pSad) {
  pSad[0] = WelsSampleSad8x16_lasx (iSample1, iStride1,
                                    iSample2 - iStride2, iStride2);
  pSad[1] = WelsSampleSad8x16_lasx (iSample1, iStride1,
                                    iSample2 + iStride2, iStride2);
  pSad[2] = WelsSampleSad8x16_lasx (iSample1, iStride1,
                                    iSample2 - 1, iStride2);
  pSad[3] = WelsSampleSad8x16_lasx (iSample1, iStride1,
                                    iSample2 + 1, iStride2);
}
490 
/*!
 * \brief   Four 16x8 SADs of one block against the four positions adjacent
 *          to a reference position, each computed by WelsSampleSad16x8_lasx.
 *
 * \param   iSample1  first byte of the block being compared
 * \param   iStride1  byte offset between consecutive rows of iSample1
 * \param   iSample2  first byte of the centre reference position
 * \param   iStride2  byte offset between consecutive rows of iSample2
 * \param   pSad      output array of four values:
 *                    pSad[0] - reference at iSample2 - iStride2 (row above),
 *                    pSad[1] - reference at iSample2 + iStride2 (row below),
 *                    pSad[2] - reference at iSample2 - 1 (one byte left),
 *                    pSad[3] - reference at iSample2 + 1 (one byte right)
 */
void WelsSampleSadFour16x8_lasx (uint8_t* iSample1, int32_t iStride1,
                                 uint8_t* iSample2, int32_t iStride2,
                                 int32_t* pSad) {
  pSad[0] = WelsSampleSad16x8_lasx (iSample1, iStride1,
                                    iSample2 - iStride2, iStride2);
  pSad[1] = WelsSampleSad16x8_lasx (iSample1, iStride1,
                                    iSample2 + iStride2, iStride2);
  pSad[2] = WelsSampleSad16x8_lasx (iSample1, iStride1,
                                    iSample2 - 1, iStride2);
  pSad[3] = WelsSampleSad16x8_lasx (iSample1, iStride1,
                                    iSample2 + 1, iStride2);
}
503 
/*!
 * \brief   Four 16x16 SADs of one block against the four positions adjacent
 *          to a reference position, each computed by
 *          WelsSampleSad16x16_lasx.
 *
 * \param   iSample1  first byte of the block being compared
 * \param   iStride1  byte offset between consecutive rows of iSample1
 * \param   iSample2  first byte of the centre reference position
 * \param   iStride2  byte offset between consecutive rows of iSample2
 * \param   pSad      output array of four values:
 *                    pSad[0] - reference at iSample2 - iStride2 (row above),
 *                    pSad[1] - reference at iSample2 + iStride2 (row below),
 *                    pSad[2] - reference at iSample2 - 1 (one byte left),
 *                    pSad[3] - reference at iSample2 + 1 (one byte right)
 */
void WelsSampleSadFour16x16_lasx (uint8_t* iSample1, int32_t iStride1,
                                  uint8_t* iSample2, int32_t iStride2,
                                  int32_t* pSad) {
  pSad[0] = WelsSampleSad16x16_lasx (iSample1, iStride1,
                                     iSample2 - iStride2, iStride2);
  pSad[1] = WelsSampleSad16x16_lasx (iSample1, iStride1,
                                     iSample2 + iStride2, iStride2);
  pSad[2] = WelsSampleSad16x16_lasx (iSample1, iStride1,
                                     iSample2 - 1, iStride2);
  pSad[3] = WelsSampleSad16x16_lasx (iSample1, iStride1,
                                     iSample2 + 1, iStride2);
}
516 
/*!
 * \brief   SATD cost of an 8x8 block, computed as the sum of
 *          WelsSampleSatd4x4_lasx over its four 4x4 quadrants.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the four quadrant results
 */
int32_t WelsSampleSatd8x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                uint8_t* pSample2, int32_t iStride2) {
  /* Start of row 4; multiplied rather than shifted because left-shifting a
   * negative stride is undefined behaviour in C. */
  uint8_t* pLower1 = pSample1 + iStride1 * 4;
  uint8_t* pLower2 = pSample2 + iStride2 * 4;
  int32_t iSatdSum = 0;

  iSatdSum += WelsSampleSatd4x4_lasx (pSample1, iStride1,
                                      pSample2, iStride2);
  iSatdSum += WelsSampleSatd4x4_lasx (pSample1 + 4, iStride1,
                                      pSample2 + 4, iStride2);
  iSatdSum += WelsSampleSatd4x4_lasx (pLower1, iStride1,
                                      pLower2, iStride2);
  iSatdSum += WelsSampleSatd4x4_lasx (pLower1 + 4, iStride1,
                                      pLower2 + 4, iStride2);
  return iSatdSum;
}
531 
/*!
 * \brief   SATD cost of a 16x8 block, computed as the sum of
 *          WelsSampleSatd8x8_lasx over its left and right 8x8 halves.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the two 8x8 results
 */
int32_t WelsSampleSatd16x8_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  return WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2) +
         WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1,
                                 pSample2 + 8, iStride2);
}
542 
/*!
 * \brief   SATD cost of an 8x16 block, computed as the sum of
 *          WelsSampleSatd8x8_lasx over its upper and lower 8x8 halves.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the two 8x8 results
 */
int32_t WelsSampleSatd8x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                 uint8_t* pSample2, int32_t iStride2) {
  /* Start of row 8; multiplied rather than shifted because left-shifting a
   * negative stride is undefined behaviour in C. */
  uint8_t* pLower1 = pSample1 + iStride1 * 8;
  uint8_t* pLower2 = pSample2 + iStride2 * 8;

  return WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2) +
         WelsSampleSatd8x8_lasx (pLower1, iStride1, pLower2, iStride2);
}
553 
/*!
 * \brief   SATD cost of a 16x16 block, computed as the sum of
 *          WelsSampleSatd8x8_lasx over its four 8x8 quadrants.
 *
 * \param   pSample1  first byte of the first block
 * \param   iStride1  byte offset between consecutive rows of pSample1
 * \param   pSample2  first byte of the second block
 * \param   iStride2  byte offset between consecutive rows of pSample2
 *
 * \return  sum of the four quadrant results
 */
int32_t WelsSampleSatd16x16_lasx (uint8_t* pSample1, int32_t iStride1,
                                  uint8_t* pSample2, int32_t iStride2) {
  /* Start of row 8; multiplied rather than shifted because left-shifting a
   * negative stride is undefined behaviour in C. */
  uint8_t* pLower1 = pSample1 + iStride1 * 8;
  uint8_t* pLower2 = pSample2 + iStride2 * 8;
  int32_t iSatdSum = 0;

  iSatdSum += WelsSampleSatd8x8_lasx (pSample1, iStride1,
                                      pSample2, iStride2);
  iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1,
                                      pSample2 + 8, iStride2);
  iSatdSum += WelsSampleSatd8x8_lasx (pLower1, iStride1,
                                      pLower2, iStride2);
  iSatdSum += WelsSampleSatd8x8_lasx (pLower1 + 8, iStride1,
                                      pLower2 + 8, iStride2);
  return iSatdSum;
}
568