/*!
 *************************************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Jin Bo <jinbo@loongson.cn>
 *
 * \copy
 * Copyright (c) 2022, Cisco Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    mc_horver_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    3/3/2022 Created
 *
 *************************************************************************************
 */

#include <stdint.h>
#include "loongson_intrinsics.h"

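/*
 * The two macros below implement the H.264 6-tap luma interpolation filter
 * with coefficients (1, -5, 20, 20, -5, 1), built only from adds and shifts:
 *   out = (in0 + in5) - 5 * (in1 + in4) + 20 * (in2 + in3)
 * As a reference, here is a minimal scalar sketch of the same arithmetic.
 * The helper name is illustrative only and is not called by the LSX code
 * in this file.
 */
static inline int32_t FilterTap6Sketch (int32_t p0, int32_t p1, int32_t p2,
                                        int32_t p3, int32_t p4, int32_t p5) {
  // (1, -5, 20, 20, -5, 1) applied to six neighbouring samples
  return (p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3);
}
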
#define FILTER_INPUT_8BIT(_in0, _in1, _in2, _in3, \
                          _in4, _in5, _out0)      \
do {                                              \
  __m128i _tmp;                                   \
  _in0 = __lsx_vadd_h(_in0, _in5);                \
  _in1 = __lsx_vadd_h(_in1, _in4);                \
  _in2 = __lsx_vadd_h(_in2, _in3);                \
  _tmp = __lsx_vslli_h(_in1, 2);                  \
  _in1 = __lsx_vadd_h(_tmp, _in1);                \
  _in0 = __lsx_vsub_h(_in0, _in1);                \
  _tmp = __lsx_vslli_h(_in2, 4);                  \
  _in0 = __lsx_vadd_h(_in0, _tmp);                \
  _tmp = __lsx_vslli_h(_in2, 2);                  \
  _out0 = __lsx_vadd_h(_in0, _tmp);               \
} while (0)

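/*
 * HOR_FILTER_INPUT_16BIT applies the same 6-tap filter to 32-bit lanes. It is
 * used by the McHorVer22* functions for the second (horizontal) pass, where
 * the first pass keeps its unrounded 16-bit intermediates and the combined
 * result is rounded once with a shift of 10 (see __lsx_vsrari_w(..., 10)).
 */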
#define HOR_FILTER_INPUT_16BIT(_in0, _in1, _in2, _in3, \
                               _in4, _in5, _out0)      \
do {                                                   \
  __m128i _pi05, _pi14, _pi23, _temp;                  \
  _pi05 = __lsx_vadd_w(_in0, _in5);                    \
  _pi14 = __lsx_vadd_w(_in1, _in4);                    \
  _pi23 = __lsx_vadd_w(_in2, _in3);                    \
  _temp = __lsx_vslli_w(_pi14, 2);                     \
  _pi14 = __lsx_vadd_w(_temp, _pi14);                  \
  _pi05 = __lsx_vsub_w(_pi05, _pi14);                  \
  _temp = __lsx_vslli_w(_pi23, 4);                     \
  _pi05 = __lsx_vadd_w(_pi05, _temp);                  \
  _temp = __lsx_vslli_w(_pi23, 2);                     \
  _out0 = __lsx_vadd_w(_pi05, _temp);                  \
} while (0)

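/*
 * PixelAvgWidthEq{4,8,16}_lsx average two predictions with rounding,
 * i.e. dst[x] = (srcA[x] + srcB[x] + 1) >> 1 for every byte of every row,
 * which is exactly what __lsx_vavgr_bu computes per unsigned byte lane.
 * The three variants differ only in how many bytes per row they load/store
 * and in how many rows they unroll per iteration.
 */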
void PixelAvgWidthEq4_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                          int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                          int32_t iHeight) {
  int32_t i;
  __m128i src0, src1;
  for (i = 0; i < iHeight; i++) {
    src0 = __lsx_vldrepl_w(pSrcA, 0);
    src1 = __lsx_vldrepl_w(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src0 = __lsx_vavgr_bu(src0, src1);
    __lsx_vstelm_w(src0, pDst, 0, 0);
    pDst += iDstStride;
  }
}

void PixelAvgWidthEq8_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                          int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                          int32_t iHeight) {
  int32_t i;
  __m128i src0, src1, src2, src3;
  for (i = 0; i < iHeight; i += 2) {
    src0 = __lsx_vldrepl_d(pSrcA, 0);
    src1 = __lsx_vldrepl_d(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src0 = __lsx_vavgr_bu(src0, src1);
    src2 = __lsx_vldrepl_d(pSrcA, 0);
    src3 = __lsx_vldrepl_d(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src2 = __lsx_vavgr_bu(src2, src3);
    __lsx_vstelm_d(src0, pDst, 0, 0);
    pDst += iDstStride;
    __lsx_vstelm_d(src2, pDst, 0, 0);
    pDst += iDstStride;
  }
}

void PixelAvgWidthEq16_lsx(uint8_t *pDst, int32_t iDstStride, const uint8_t *pSrcA,
                           int32_t iSrcAStride, const uint8_t *pSrcB, int32_t iSrcBStride,
                           int32_t iHeight) {
  int32_t i;
  __m128i src0, src1, src2, src3;
  __m128i src4, src5, src6, src7;
  for (i = 0; i < iHeight; i += 4) {
    src0 = __lsx_vld(pSrcA, 0);
    src1 = __lsx_vld(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src0 = __lsx_vavgr_bu(src0, src1);
    src2 = __lsx_vld(pSrcA, 0);
    src3 = __lsx_vld(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src2 = __lsx_vavgr_bu(src2, src3);
    src4 = __lsx_vld(pSrcA, 0);
    src5 = __lsx_vld(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src4 = __lsx_vavgr_bu(src4, src5);
    src6 = __lsx_vld(pSrcA, 0);
    src7 = __lsx_vld(pSrcB, 0);
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    src6 = __lsx_vavgr_bu(src6, src7);
    __lsx_vst(src0, pDst, 0);
    pDst += iDstStride;
    __lsx_vst(src2, pDst, 0);
    pDst += iDstStride;
    __lsx_vst(src4, pDst, 0);
    pDst += iDstStride;
    __lsx_vst(src6, pDst, 0);
    pDst += iDstStride;
  }
}

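/*
 * McHorVer02* = vertical half-pel interpolation: for every output row the six
 * source rows at offsets -2..+3 are loaded (via iStride1/2/3 and negative
 * offsets), widened to 16 bits, filtered with FILTER_INPUT_8BIT, rounded with
 * a shift of 5, clamped to [0, 255] and packed back to bytes. A scalar sketch
 * of one output pixel, assuming p points at the center row (illustrative
 * only; FilterTap6Sketch is the reference helper defined above):
 *   int32_t v = FilterTap6Sketch (p[-2 * stride], p[-stride], p[0],
 *                                 p[stride], p[2 * stride], p[3 * stride]);
 *   v = (v + 16) >> 5;
 *   dst[x] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
 */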
void McHorVer02WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3, src4, src5, src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_d(src0, pDst, 0, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

void McHorVer02WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    // low 8 pixels
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    // high 8 pixels
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

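/*
 * McHorVer20* = horizontal half-pel interpolation: a single 16-byte load at
 * pSrc - 2 provides all six taps for a row, and the shifted copies produced
 * by __lsx_vbsrl_v(src0, 1..5) play the role of src[x-1]..src[x+3]. The
 * result is rounded with a shift of 5, clamped to [0, 255], and 4, 5 or 8
 * pixels are stored depending on the width variant. Roughly, per pixel:
 *   dst[x] = clip255 ((FilterTap6Sketch (s[x-2], s[x-1], s[x], s[x+1],
 *                                        s[x+2], s[x+3]) + 16) >> 5);
 * (clip255 here is shorthand in this comment, not a function in this file.)
 */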
void McHorVer20WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3, src4, src5, src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_w(src0, pDst, 0, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

void McHorVer20WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3, src4, src5, src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_w(src0, pDst, 0, 0);
    __lsx_vstelm_b(src0, pDst, 4, 4);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

void McHorVer20WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  for (int i = 0; i < iHeight; i++) {
    src0 = __lsx_vld(psrc, 0);
    DUP4_ARG2(__lsx_vbsrl_v,
              src0, 1,
              src0, 2,
              src0, 3,
              src0, 4,
              src1, src2, src3, src4);
    src5 = __lsx_vbsrl_v(src0, 5);
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vsllwil_hu_bu(src4, 0);
    src5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(src0, src1, src2, src3, src4, src5, src0);
    src0 = __lsx_vsrari_h(src0, 5);
    src0 = __lsx_vclip255_h(src0);
    src0 = __lsx_vpickev_b(src0, src0);
    __lsx_vstelm_d(src0, pDst, 0, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

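/*
 * Widths that are not a multiple of the vector width are split into two runs
 * of the existing kernels: 9 = 4 + 5 columns and, further below, 17 = 8 + 9.
 */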
void McHorVer20WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq5_lsx(&pSrc[4], iSrcStride, &pDst[4], iDstStride, iHeight);
}

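/*
 * The 16-wide variant processes the low and high 8 bytes of each row
 * separately: __lsx_vsllwil_hu_bu widens bytes 0..7 and __lsx_vexth_hu_bu
 * widens bytes 8..15 to 16 bits before the same filter/round/clip/pack steps.
 */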
void McHorVer20WidthEq16_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vld,
              psrc, 0,
              psrc + 1, 0,
              psrc + 2, 0,
              psrc + 3, 0,
              src0, src1, src2, src3);
    src4 = __lsx_vld(psrc + 4, 0);
    src5 = __lsx_vld(psrc + 5, 0);
    // low 8 pixels
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    out0_l = __lsx_vsrari_h(out0_l, 5);
    out0_l = __lsx_vclip255_h(out0_l);
    // high 8 pixels
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_h = __lsx_vsrari_h(out0_h, 5);
    out0_h = __lsx_vclip255_h(out0_h);
    out0_l = __lsx_vpickev_b(out0_h, out0_l);
    __lsx_vst(out0_l, pDst, 0);
    pDst += iDstStride;
    psrc += iSrcStride;
  }
}

void McHorVer20WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer20WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer20WidthEq9_lsx(&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}

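/*
 * McHorVer22* = half-pel in both directions: the vertical 6-tap filter runs
 * first and its unrounded 16-bit intermediates are kept; the horizontal 6-tap
 * filter is then applied to those intermediates in 32-bit precision, and one
 * rounding shift of 10 plus a clamp produces the final pixel. The 8x8
 * transpose turns the per-row intermediates into per-column vectors so the
 * horizontal pass can reuse the vertical data layout. A scalar sketch of one
 * output pixel, assuming pV holds the unrounded vertical intermediates
 * (illustrative only):
 *   int32_t v = FilterTap6Sketch (pV[x - 2], pV[x - 1], pV[x],
 *                                 pV[x + 1], pV[x + 2], pV[x + 3]);
 *   v = (v + 512) >> 10;
 *   dst[x] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
 */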
void McHorVer22WidthEq8_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  v8i16 mask2 = {6, 7, 8, 9, 10, 11, 12, 13};
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    // vertical filter, low 8 columns
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // vertical filter, high 8 columns
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    dst4 = __lsx_vbsrl_v(dst3, 2);
    dst5 = __lsx_vbsrl_v(dst3, 4);
    dst6 = __lsx_vshuf_h((__m128i)mask2, out0_h, out0_l);
    dst7 = __lsx_vbsrl_v(dst6, 2);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    // horizontal filter, low 4 outputs
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // horizontal filter, high 4 outputs
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_d(out0_l, pDst, 0, 0);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}

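/*
 * The narrow 22 variants below only ever store the first 4 (or 5) pixels of
 * each row, so only dst0..dst3 (dst0..dst4 for the 5-wide case) are set up
 * before the transpose; lanes derived from the remaining dstN vectors are
 * computed but never reach memory.
 */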
static
void McHorVer22WidthEq4_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    // vertical filter, low 8 columns
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // vertical filter, high 8 columns
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    // horizontal filter, low 4 outputs
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // horizontal filter, high 4 outputs
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}

void McHorVer22WidthEq5_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  int32_t iStride1 = iSrcStride;
  int32_t iStride2 = iSrcStride << 1;
  int32_t iStride3 = iStride1 + iStride2;
  uint8_t *psrc = (uint8_t*)pSrc - 2;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l, out0_h;
  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 mask1 = {3, 4, 5, 6, 7, 8, 9, 10};
  for (int i = 0; i < iHeight; i++) {
    DUP4_ARG2(__lsx_vldx,
              psrc, -iStride2,
              psrc, -iStride1,
              psrc, iStride1,
              psrc, iStride2,
              src0, src1, src3, src4);
    src2 = __lsx_vld(psrc, 0);
    src5 = __lsx_vldx(psrc, iStride3);
    // vertical filter, low 8 columns
    DUP4_ARG2(__lsx_vsllwil_hu_bu,
              src0, 0,
              src1, 0,
              src2, 0,
              src3, 0,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vsllwil_hu_bu(src4, 0);
    tmp5 = __lsx_vsllwil_hu_bu(src5, 0);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // vertical filter, high 8 columns
    DUP4_ARG1(__lsx_vexth_hu_bu,
              src0,
              src1,
              src2,
              src3,
              tmp0, tmp1, tmp2, tmp3);
    tmp4 = __lsx_vexth_hu_bu(src4);
    tmp5 = __lsx_vexth_hu_bu(src5);
    FILTER_INPUT_8BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    dst0 = out0_l;
    dst1 = __lsx_vbsrl_v(out0_l, 2);
    dst2 = __lsx_vbsrl_v(out0_l, 4);
    dst3 = __lsx_vshuf_h((__m128i)mask1, out0_h, out0_l);
    dst4 = __lsx_vbsrl_v(dst3, 2);
    LSX_TRANSPOSE8x8_H(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                       dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    // horizontal filter, low 4 outputs
    DUP4_ARG2(__lsx_vsllwil_w_h,
              dst0, 0,
              dst1, 0,
              dst2, 0,
              dst3, 0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vsllwil_w_h,
              dst4, 0,
              dst5, 0,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_l);
    // horizontal filter, high 4 outputs
    DUP4_ARG1(__lsx_vexth_w_h,
              dst0,
              dst1,
              dst2,
              dst3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG1(__lsx_vexth_w_h,
              dst4,
              dst5,
              tmp4, tmp5);
    HOR_FILTER_INPUT_16BIT(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0_h);
    out0_l = __lsx_vsrari_w(out0_l, 10);
    out0_h = __lsx_vsrari_w(out0_h, 10);
    DUP2_ARG1(__lsx_vclip255_w,
              out0_l, out0_h,
              out0_l, out0_h);
    out0_l = __lsx_vpickev_h(out0_h, out0_l);
    out0_l = __lsx_vpickev_b(out0_l, out0_l);
    __lsx_vstelm_w(out0_l, pDst, 0, 0);
    __lsx_vstelm_b(out0_l, pDst, 4, 4);
    psrc += iSrcStride;
    pDst += iDstStride;
  }
}

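/*
 * As with the horizontal-only kernels, the 22 case composes widths 9 and 17
 * from the narrower kernels above: 9 = 4 + 5 and 17 = 8 + 9 columns.
 */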
void McHorVer22WidthEq9_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq4_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq5_lsx(&pSrc[4], iSrcStride, &pDst[4], iDstStride, iHeight);
}

void McHorVer22WidthEq17_lsx(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                             int32_t iDstStride, int32_t iHeight) {
  McHorVer22WidthEq8_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
  McHorVer22WidthEq9_lsx(&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}