• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  *************************************************************************************
3  * Copyright (c) 2022 Loongson Technology Corporation Limited
4  * Contributed by Jin Bo <jinbo@loongson.cn>
5  *
6  * \copy
7  *     Copyright (c)  2022, Cisco Systems
8  *     All rights reserved.
9  *
10  *     Redistribution and use in source and binary forms, with or without
11  *     modification, are permitted provided that the following conditions
12  *     are met:
13  *
14  *        * Redistributions of source code must retain the above copyright
15  *          notice, this list of conditions and the following disclaimer.
16  *
17  *        * Redistributions in binary form must reproduce the above copyright
18  *          notice, this list of conditions and the following disclaimer in
19  *          the documentation and/or other materials provided with the
20  *          distribution.
21  *
22  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  *     POSSIBILITY OF SUCH DAMAGE.
34  *
35  *
36  * \file    mc_horver_lsx.c
37  *
38  * \brief   Loongson optimization
39  *
40  * \date    3/3/2022 Created
41  *
42  *************************************************************************************
43  */
44 
45 #include <stdint.h>
46 #include "loongson_intrinsics.h"
47 
/* Six-tap H.264 luma interpolation filter on eight 16-bit lanes:
 *   _out0 = (_in0 + _in5) - 5 * (_in1 + _in4) + 20 * (_in2 + _in3)
 * i.e. tap weights (1, -5, 20, 20, -5, 1).  The 5x and 20x multiplies are
 * built from shifts and adds: 5*x = (x << 2) + x, 20*x = (x << 4) + (x << 2).
 * NOTE: _in0, _in1 and _in2 are clobbered (reused as scratch registers). */
#define FILTER_INPUT_8BIT(_in0, _in1, _in2, _in3, \
                          _in4, _in5, _out0)      \
do {                                              \
  __m128i _tmp;                                   \
  _in0  = __lsx_vadd_h(_in0, _in5);               \
  _in1  = __lsx_vadd_h(_in1, _in4);               \
  _in2  = __lsx_vadd_h(_in2, _in3);               \
  _tmp  = __lsx_vslli_h(_in1, 2);                 \
  _in1  = __lsx_vadd_h(_tmp, _in1);               \
  _in0  = __lsx_vsub_h(_in0, _in1);               \
  _tmp  = __lsx_vslli_h(_in2, 4);                 \
  _in0  = __lsx_vadd_h(_in0, _tmp);               \
  _tmp  = __lsx_vslli_h(_in2, 2);                 \
  _out0 = __lsx_vadd_h(_in0, _tmp);               \
}while(0)
63 
/* Same six-tap filter (1, -5, 20, 20, -5, 1) applied to four 32-bit lanes:
 *   _out0 = (_in0 + _in5) - 5 * (_in1 + _in4) + 20 * (_in2 + _in3)
 * Used for the second (horizontal) pass of the centre half-pel position,
 * where the 16-bit intermediates of the first pass need 32-bit headroom.
 * Unlike FILTER_INPUT_8BIT, the input arguments are left unmodified. */
#define HOR_FILTER_INPUT_16BIT(_in0, _in1, _in2, _in3, \
                               _in4, _in5, _out0)      \
do {                                                   \
  __m128i _pi05, _pi14, _pi23, _temp;                  \
  _pi05 = __lsx_vadd_w(_in0, _in5);                    \
  _pi14 = __lsx_vadd_w(_in1, _in4);                    \
  _pi23 = __lsx_vadd_w(_in2, _in3);                    \
  _temp = __lsx_vslli_w(_pi14, 2);                     \
  _pi14 = __lsx_vadd_w(_temp, _pi14);                  \
  _pi05 = __lsx_vsub_w(_pi05, _pi14);                  \
  _temp = __lsx_vslli_w(_pi23, 4);                     \
  _pi05 = __lsx_vadd_w(_pi05, _temp);                  \
  _temp = __lsx_vslli_w(_pi23, 2);                     \
  _out0 = __lsx_vadd_w(_pi05, _temp);                  \
}while(0)
79 
PixelAvgWidthEq4_lsx(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iHeight)80 void PixelAvgWidthEq4_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
81                           int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
82                           int32_t iHeight ) {
83   int32_t i;
84   __m128i src0, src1;
85   for (i = 0; i < iHeight; i++) {
86     src0 = __lsx_vldrepl_w(pSrcA, 0);
87     src1 = __lsx_vldrepl_w(pSrcB, 0);
88     pSrcA += iSrcAStride;
89     pSrcB += iSrcBStride;
90     src0 = __lsx_vavgr_bu(src0, src1);
91     __lsx_vstelm_w(src0, pDst, 0, 0);
92     pDst  += iDstStride;
93   }
94 }
95 
PixelAvgWidthEq8_lsx(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iHeight)96 void PixelAvgWidthEq8_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
97                           int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
98                           int32_t iHeight ) {
99   int32_t i;
100   __m128i src0, src1, src2, src3;
101   for (i = 0; i < iHeight; i += 2) {
102     src0 = __lsx_vldrepl_d(pSrcA, 0);
103     src1 = __lsx_vldrepl_d(pSrcB, 0);
104     pSrcA += iSrcAStride;
105     pSrcB += iSrcBStride;
106     src0 = __lsx_vavgr_bu(src0, src1);
107     src2 = __lsx_vldrepl_d(pSrcA, 0);
108     src3 = __lsx_vldrepl_d(pSrcB, 0);
109     pSrcA += iSrcAStride;
110     pSrcB += iSrcBStride;
111     src2 = __lsx_vavgr_bu(src2, src3);
112     __lsx_vstelm_d(src0, pDst, 0, 0);
113     pDst  += iDstStride;
114     __lsx_vstelm_d(src2, pDst, 0, 0);
115     pDst  += iDstStride;
116   }
117 }
118 
PixelAvgWidthEq16_lsx(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iHeight)119 void PixelAvgWidthEq16_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
120                            int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
121                            int32_t iHeight ) {
122   int32_t i;
123   __m128i src0, src1, src2, src3;
124   __m128i src4, src5, src6, src7;
125   for (i = 0; i < iHeight; i += 4) {
126     src0 = __lsx_vld(pSrcA, 0);
127     src1 = __lsx_vld(pSrcB, 0);
128     pSrcA += iSrcAStride;
129     pSrcB += iSrcBStride;
130     src0 = __lsx_vavgr_bu(src0, src1);
131     src2 = __lsx_vld(pSrcA, 0);
132     src3 = __lsx_vld(pSrcB, 0);
133     pSrcA += iSrcAStride;
134     pSrcB += iSrcBStride;
135     src2 = __lsx_vavgr_bu(src2, src3);
136     src4 = __lsx_vld(pSrcA, 0);
137     src5 = __lsx_vld(pSrcB, 0);
138     pSrcA += iSrcAStride;
139     pSrcB += iSrcBStride;
140     src4 = __lsx_vavgr_bu(src4, src5);
141     src6 = __lsx_vld(pSrcA, 0);
142     src7 = __lsx_vld(pSrcB, 0);
143     pSrcA += iSrcAStride;
144     pSrcB += iSrcBStride;
145     src6 = __lsx_vavgr_bu(src6, src7);
146     __lsx_vst(src0, pDst, 0);
147     pDst  += iDstStride;
148     __lsx_vst(src2, pDst, 0);
149     pDst += iDstStride;
150     __lsx_vst(src4, pDst, 0);
151     pDst += iDstStride;
152     __lsx_vst(src6, pDst, 0);
153     pDst += iDstStride;
154   }
155 }
156 
/*!
 * \brief  Vertical half-sample (hor=0, ver=2) luma interpolation for an
 *         8-pixel-wide block.  Each output row applies the 6-tap filter
 *         (1,-5,20,20,-5,1) down the six source rows at vertical offsets
 *         -2..+3, then rounds with (x + 16) >> 5 and clips to [0,255].
 *
 * \param  pSrc        source pixels (points at the centre row, offset 0)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer02WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    // load tap rows at offsets -2, -1, +1, +2
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);          // centre row (offset 0)
    src5 = __lsx_vldx(psrc, iStride3);  // row at offset +3
    // widen the low 8 bytes of each row to unsigned 16-bit lanes
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);      // rounding shift: (x + 16) >> 5
    src0 = __lsx_vclip255_h(src0);       // clamp to [0, 255]
    src0 = __lsx_vpickev_b(src0, src0);  // pack 16-bit results to bytes
    __lsx_vstelm_d(src0, pDst, 0, 0);    // store 8 output pixels
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
190 
/*!
 * \brief  Vertical half-sample (hor=0, ver=2) luma interpolation for a
 *         16-pixel-wide block.  Same 6-tap filter as the width-8 variant,
 *         but each 16-byte row is processed as two 8-lane halves (low bytes
 *         widened with vsllwil, high bytes with vexth) and repacked.
 *
 * \param  pSrc        source pixels (points at the centre row, offset 0)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer02WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    // load tap rows at offsets -2, -1, +1, +2
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);          // centre row (offset 0)
    src5 = __lsx_vldx(psrc, iStride3);  // row at offset +3
    //l part: low 8 bytes widened to u16, filtered, rounded, clipped
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    //h part: high 8 bytes widened to u16, filtered, rounded, clipped
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    // pack both halves back to 16 bytes and store one output row
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
238 
/*!
 * \brief  Horizontal half-sample (hor=2, ver=0) luma interpolation for a
 *         4-pixel-wide block.  psrc starts 2 bytes left of pSrc, so one
 *         16-byte load covers all six taps; byte-shifted copies (vbsrl)
 *         provide the shifted windows for taps +1..+5.
 *
 * \param  pSrc        source pixels (centre column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer20WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    // shift the row right by 1..5 bytes to align the six filter taps
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    // widen the low 8 bytes of each window to unsigned 16-bit
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);      // rounding shift: (x + 16) >> 5
    src0 = __lsx_vclip255_h(src0);       // clamp to [0, 255]
    src0 = __lsx_vpickev_b(src0, src0);  // pack to bytes
    __lsx_vstelm_w(src0, pDst, 0, 0);    // store 4 output pixels
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
269 
/*!
 * \brief  Horizontal half-sample (hor=2, ver=0) luma interpolation for a
 *         5-pixel-wide block.  Identical to the width-4 kernel, but stores
 *         a 4-byte word plus the fifth byte separately per row.
 *
 * \param  pSrc        source pixels (centre column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer20WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    // shift the row right by 1..5 bytes to align the six filter taps
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    // widen the low 8 bytes of each window to unsigned 16-bit
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);      // rounding shift: (x + 16) >> 5
    src0 = __lsx_vclip255_h(src0);       // clamp to [0, 255]
    src0 = __lsx_vpickev_b(src0, src0);  // pack to bytes
    __lsx_vstelm_w(src0, pDst, 0, 0);    // store pixels 0..3
    __lsx_vstelm_b(src0, pDst, 4, 4);    // store the fifth pixel
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
301 
/*!
 * \brief  Horizontal half-sample (hor=2, ver=0) luma interpolation for an
 *         8-pixel-wide block.  One 16-byte load per row covers all taps
 *         (psrc starts 2 bytes left of pSrc); vbsrl produces the shifted
 *         windows, then the 6-tap filter, round (>>5) and clip are applied.
 *
 * \param  pSrc        source pixels (centre column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer20WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc -2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    // shift the row right by 1..5 bytes to align the six filter taps
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    // widen the low 8 bytes of each window to unsigned 16-bit
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3 ,src4, src5 ,src0);
    src0 = __lsx_vsrari_h(src0, 5);      // rounding shift: (x + 16) >> 5
    src0 = __lsx_vclip255_h(src0);       // clamp to [0, 255]
    src0 = __lsx_vpickev_b(src0, src0);  // pack to bytes
    __lsx_vstelm_d(src0, pDst, 0, 0);    // store 8 output pixels
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
332 
/*!
 * \brief  Horizontal half-sample interpolation for a 9-pixel-wide block:
 *         the left 4 columns use the width-4 kernel, the remaining 5
 *         columns the width-5 kernel.
 */
void McHorVer20WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq5_lsx(pSrc + 4, iSrcStride, pDst + 4, iDstStride, iHeight);
}
338 
/*!
 * \brief  Horizontal half-sample (hor=2, ver=0) luma interpolation for a
 *         16-pixel-wide block.  The six tap windows are obtained with six
 *         unaligned loads at psrc+0..psrc+5; each window is filtered as two
 *         8-lane halves (low bytes via vsllwil, high bytes via vexth).
 *
 * \param  pSrc        source pixels (centre column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer20WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2 ,tmp3 ,tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    // six overlapping 16-byte loads give the tap windows +0..+5
    DUP4_ARG2(__lsx_vld,
              psrc,  0,
              psrc + 1, 0,
              psrc + 2, 0,
              psrc + 3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vld(psrc + 4, 0);
    src5 = __lsx_vld(psrc + 5, 0);
    //l part: low 8 bytes widened, filtered, rounded, clipped
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    //h part: high 8 bytes widened, filtered, rounded, clipped
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    // pack both halves to 16 bytes and store one output row
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}
383 
/*!
 * \brief  Horizontal half-sample interpolation for a 17-pixel-wide block:
 *         the left 8 columns use the width-8 kernel, the remaining 9
 *         columns the width-9 kernel.
 */
void McHorVer20WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq9_lsx(pSrc + 8, iSrcStride, pDst + 8, iDstStride, iHeight);
}
389 
/*!
 * \brief  Centre half-sample (hor=2, ver=2) luma interpolation for an
 *         8-pixel-wide block.  Two-stage filter:
 *         1) vertical 6-tap pass over rows -2..+3, kept as unrounded 16-bit
 *            intermediates h[0..15] (out0_l = h[0..7], out0_h = h[8..15]);
 *         2) shifted views dst0..dst7 = h[k..k+7] for k = 0..7 are built
 *            (vbsrl byte-shifts plus vshuf_h to splice across the l/h
 *            halves), transposed so register t holds tap t for all 8 output
 *            columns, then filtered horizontally in 32-bit precision.
 *         Final rounding is (x + 512) >> 10, then clip to [0,255].
 *
 * \param  pSrc        source pixels (centre row/column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer22WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;   // back up 2 for the horizontal taps
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  // halfword indices into {out0_l, out0_h} used to splice windows h[3..10]
  // and h[6..13] across the two vertical-result registers
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  v8i16 mask2 = {6, 7, 8, 9, 10, 11, 12 ,13};
  for (int i = 0; i < iHeight; i++) {
    // load the six vertical tap rows: -2, -1, 0, +1, +2, +3
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part: vertical filter on low 8 bytes -> h[0..7] (no rounding yet)
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part: vertical filter on high 8 bytes -> h[8..15]
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    // build the eight shifted windows dstK = h[K..K+7]
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    dst4 = __lsx_vbsrl_v(dst3, 2);
    dst5 = __lsx_vbsrl_v(dst3, 4);
    dst6 = __lsx_vshuf_h((__m128i)mask2, out0_h, out0_l);
    dst7 = __lsx_vbsrl_v(dst6, 2);
    // after transpose, dstT lane P == h[P + T]: tap T of output column P
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part: horizontal filter for columns 0..3 in 32-bit precision
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part: horizontal filter for columns 4..7
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);   // combined rounding: (x + 512) >> 10
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    // pack 32-bit -> 16-bit -> bytes and store 8 output pixels
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_d(out0_l, pDst, 0, 0);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
476 
/*!
 * \brief  Centre half-sample (hor=2, ver=2) luma interpolation for a
 *         4-pixel-wide block.  Same two-stage scheme as the width-8 kernel:
 *         vertical 6-tap pass to unrounded 16-bit intermediates h[0..15],
 *         shifted windows + transpose, horizontal 6-tap pass in 32-bit,
 *         rounding (x + 512) >> 10, clip, and a 4-byte store.
 *
 * NOTE(review): dst4..dst7 enter LSX_TRANSPOSE8x8_H uninitialized here;
 * the transposed lanes derived from them do not reach the four bytes that
 * are stored (only lanes from dst0..dst3 do), but the read of indeterminate
 * values looks intentional upstream — confirm.
 *
 * \param  pSrc        source pixels (centre row/column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
static
void McHorVer22WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;   // back up 2 for the horizontal taps
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  // halfword indices splicing window h[3..10] across {out0_l, out0_h}
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    // load the six vertical tap rows: -2, -1, 0, +1, +2, +3
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part: vertical filter on low 8 bytes -> h[0..7] (no rounding yet)
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part: vertical filter on high 8 bytes -> h[8..15]
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    // shifted windows dstK = h[K..K+7] for K = 0..3 (only 4 columns needed)
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part: horizontal filter for columns 0..3 in 32-bit precision
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part (high lanes; results beyond column 3 are discarded by the store)
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);   // combined rounding: (x + 512) >> 10
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    // pack 32-bit -> 16-bit -> bytes and store 4 output pixels
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
559 
/*!
 * \brief  Centre half-sample (hor=2, ver=2) luma interpolation for a
 *         5-pixel-wide block.  Same two-stage scheme as the width-8 kernel;
 *         windows dst0..dst4 = h[0..7]..h[4..11] cover the five output
 *         columns, and the store writes a 4-byte word plus the fifth byte.
 *
 * NOTE(review): dst5..dst7 enter LSX_TRANSPOSE8x8_H uninitialized here;
 * the transposed lanes derived from them do not reach the five bytes that
 * are stored, but the read of indeterminate values looks intentional
 * upstream — confirm.
 *
 * \param  pSrc        source pixels (centre row/column)
 * \param  iSrcStride  source stride in bytes
 * \param  pDst        destination pixels
 * \param  iDstStride  destination stride in bytes
 * \param  iHeight     number of output rows
 */
void McHorVer22WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;   // back up 2 for the horizontal taps
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  // halfword indices splicing window h[3..10] across {out0_l, out0_h}
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    // load the six vertical tap rows: -2, -1, 0, +1, +2, +3
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    //l part: vertical filter on low 8 bytes -> h[0..7] (no rounding yet)
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part: vertical filter on high 8 bytes -> h[8..15]
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1 ,tmp2, tmp3, tmp4, tmp5, out0_h);
    // shifted windows dstK = h[K..K+7] for K = 0..4 (five output columns)
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    dst4 = __lsx_vbsrl_v(dst3, 2);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    //l part: horizontal filter for columns 0..3 in 32-bit precision
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    //h part (high lanes; only column 4 of these survives the store)
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);   // combined rounding: (x + 512) >> 10
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    // pack 32-bit -> 16-bit -> bytes; store pixels 0..3 then pixel 4
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0);
    __lsx_vstelm_b(out0_l, pDst, 4, 4);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}
643 
/*!
 * \brief  Centre half-sample interpolation for a 9-pixel-wide block:
 *         the left 4 columns use the width-4 kernel, the remaining 5
 *         columns the width-5 kernel.
 */
void McHorVer22WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq5_lsx(pSrc + 4, iSrcStride, pDst + 4, iDstStride, iHeight);
}
649 
/*!
 * \brief  Centre half-sample interpolation for a 17-pixel-wide block:
 *         the left 8 columns use the width-8 kernel, the remaining 9
 *         columns the width-9 kernel.
 */
void McHorVer22WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq9_lsx(pSrc + 8, iSrcStride, pDst + 8, iDstStride, iHeight);
}
655