1 /*
2 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
3 *
4 * Copyright (c) 2022 Loongson Technology Corporation Limited
5 *
6 * Use of this source code is governed by a BSD-style license
7 * that can be found in the LICENSE file in the root of the source
8 * tree. An additional intellectual property rights grant can be found
9 * in the file PATENTS. All contributing project authors may
10 * be found in the AUTHORS file in the root of the source tree.
11 */
12
13 #include "libyuv/row.h"
14
15 #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
16 #include "libyuv/loongson_intrinsics.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
// Alpha constant. -1 has every bit set, so it reads as 0xFF when narrowed to a
// byte. Not referenced in this part of the file; presumably the opaque-alpha
// value for kernels defined further down - confirm against its users.
#define ALPHA_VAL (-1)
24
// Broadcast the YUV -> RGB coefficients held in a YuvConstants into vectors.
// ub, vr, ug, vg and yg are replicated into 16-bit lanes; yb (the Y bias) is
// replicated into 32-bit lanes.
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
  {                                                      \
    yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]);     \
    yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
    ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]);      \
    ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]);      \
    vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]);      \
    vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]);      \
  }
35
// Load 32 pixels of YUV422 data.
// out_y receives 32 raw Y bytes. The U and V vectors have the caller's
// const_0x80 vector subtracted byte-wise, are widened to signed 16-bit lanes
// and are then interleaved halfword-wise into uv_l / uv_h.
// Requires a vector named const_0x80 in the calling scope.
// NOTE(review): U and V are read with full 32-byte loads while callers advance
// those pointers by only 16 per call - confirm the buffers tolerate the
// over-read at the end of a row.
#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h)         \
  {                                                                     \
    __m256i ld_u, ld_v;                                                 \
                                                                        \
    out_y = __lasx_xvld(psrc_y, 0);                                     \
    ld_u = __lasx_xvld(psrc_u, 0);                                      \
    ld_v = __lasx_xvld(psrc_v, 0);                                      \
    DUP2_ARG2(__lasx_xvsub_b, ld_u, const_0x80, ld_v, const_0x80, ld_u, \
              ld_v);                                                    \
    DUP2_ARG1(__lasx_vext2xv_h_b, ld_u, ld_v, ld_u, ld_v);              \
    uv_l = __lasx_xvilvl_h(ld_u, ld_v);                                 \
    uv_h = __lasx_xvilvh_h(ld_u, ld_v);                                 \
  }
50
// Load 16 pixels of YUV422 data.
// out_y receives a full 32-byte vector load from psrc_y. Eight bytes are
// loaded (and replicated) from each of psrc_u and psrc_v, interleaved
// byte-wise, biased by the caller's const_0x80 vector and widened to signed
// 16-bit lanes in uv.
// Requires a vector named const_0x80 in the calling scope.
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv)           \
  {                                                             \
    __m256i ld_u, ld_v;                                         \
                                                                \
    ld_u = __lasx_xvldrepl_d(psrc_u, 0);                        \
    ld_v = __lasx_xvldrepl_d(psrc_v, 0);                        \
    out_y = __lasx_xvld(psrc_y, 0);                             \
    uv = __lasx_vext2xv_h_b(                                    \
        __lasx_xvsub_b(__lasx_xvilvl_b(ld_u, ld_v), const_0x80)); \
  }
63
// Convert 32 pixels of YUV to RGB (the input pairs with READYUV422_D, which
// supplies 32 Y bytes per call).
// in_y holds the Y bytes; in_uvl / in_uvh hold the widened, interleaved chroma.
// ubvr and ugvg are the interleaved chroma coefficients, yg the Y gain and yb
// the Y bias. Each channel is computed in 32-bit lanes, shifted right by 6,
// clamped with __lasx_xvclip255_w and packed into 16-bit lanes of the
// b_*, g_* and r_* outputs.
#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
                   g_h, r_l, r_h)                                           \
  {                                                                         \
    __m256i yl_e, yl_o, yh_e, yh_o;                                         \
    __m256i bu_l, bu_h, rv_l, rv_h, gc_l, gc_h;                             \
    __m256i px0, px1, px2, px3;                                             \
                                                                            \
    /* Luma: double each Y byte into a halfword, scale by yg, add yb. */    \
    px0 = __lasx_xvilvl_b(in_y, in_y);                                      \
    px1 = __lasx_xvilvh_b(in_y, in_y);                                      \
    yl_e = __lasx_xvsrai_w(__lasx_xvmulwev_w_hu_h(px0, yg), 16);            \
    yl_o = __lasx_xvsrai_w(__lasx_xvmulwod_w_hu_h(px0, yg), 16);            \
    yh_e = __lasx_xvsrai_w(__lasx_xvmulwev_w_hu_h(px1, yg), 16);            \
    yh_o = __lasx_xvsrai_w(__lasx_xvmulwod_w_hu_h(px1, yg), 16);            \
    DUP4_ARG2(__lasx_xvadd_w, yl_e, yb, yl_o, yb, yh_e, yb, yh_o, yb, yl_e, \
              yl_o, yh_e, yh_o);                                            \
    /* Chroma terms: even lanes feed red, odd lanes feed blue. */           \
    rv_l = __lasx_xvmulwev_w_h(in_uvl, ubvr);                               \
    bu_l = __lasx_xvmulwod_w_h(in_uvl, ubvr);                               \
    rv_h = __lasx_xvmulwev_w_h(in_uvh, ubvr);                               \
    bu_h = __lasx_xvmulwod_w_h(in_uvh, ubvr);                               \
    /* Blue. */                                                             \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yl_e, bu_l), 6)); \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yl_o, bu_l), 6)); \
    px2 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yh_e, bu_h), 6)); \
    px3 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yh_o, bu_h), 6)); \
    b_l = __lasx_xvpackev_h(px1, px0);                                      \
    b_h = __lasx_xvpackev_h(px3, px2);                                      \
    /* Red. */                                                              \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yl_e, rv_l), 6)); \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yl_o, rv_l), 6)); \
    px2 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yh_e, rv_h), 6)); \
    px3 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(yh_o, rv_h), 6)); \
    r_l = __lasx_xvpackev_h(px1, px0);                                      \
    r_h = __lasx_xvpackev_h(px3, px2);                                      \
    /* Green: the combined U/V dot product is subtracted from luma. */      \
    gc_l = __lasx_xvdp2_w_h(in_uvl, ugvg);                                  \
    gc_h = __lasx_xvdp2_w_h(in_uvh, ugvg);                                  \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(yl_e, gc_l), 6)); \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(yl_o, gc_l), 6)); \
    px2 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(yh_e, gc_h), 6)); \
    px3 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(yh_o, gc_h), 6)); \
    g_l = __lasx_xvpackev_h(px1, px0);                                      \
    g_h = __lasx_xvpackev_h(px3, px2);                                      \
  }
120
// Convert 16 pixels of YUV to RGB (single-vector variant; it is paired with
// READYUV422 and STOREARGB, which stores 64 bytes).
// Note: in_y is permuted in place, so the caller's Y vector is modified.
// Each channel is computed in 32-bit lanes, shifted right by 6, clamped with
// __lasx_xvclip255_w and packed into 16-bit lanes of out_b / out_g / out_r.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r)      \
  {                                                                         \
    __m256i y_e, y_o, bu, rv, gc;                                           \
    __m256i px0, px1;                                                       \
                                                                            \
    in_y = __lasx_xvpermi_d(in_y, 0xD8);                                    \
    px0 = __lasx_xvilvl_b(in_y, in_y);                                      \
    y_e = __lasx_xvsrai_w(__lasx_xvmulwev_w_hu_h(px0, yg), 16);             \
    y_o = __lasx_xvsrai_w(__lasx_xvmulwod_w_hu_h(px0, yg), 16);             \
    DUP2_ARG2(__lasx_xvadd_w, y_e, yb, y_o, yb, y_e, y_o);                  \
    rv = __lasx_xvmulwev_w_h(in_uv, ubvr);                                  \
    bu = __lasx_xvmulwod_w_h(in_uv, ubvr);                                  \
    /* Blue. */                                                             \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(y_e, bu), 6));  \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(y_o, bu), 6));  \
    out_b = __lasx_xvpackev_h(px1, px0);                                    \
    /* Red. */                                                              \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(y_e, rv), 6));  \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvadd_w(y_o, rv), 6));  \
    out_r = __lasx_xvpackev_h(px1, px0);                                    \
    /* Green. */                                                            \
    gc = __lasx_xvdp2_w_h(in_uv, ugvg);                                     \
    px0 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(y_e, gc), 6));  \
    px1 = __lasx_xvclip255_w(__lasx_xvsrai_w(__lasx_xvsub_w(y_o, gc), 6));  \
    out_g = __lasx_xvpackev_h(px1, px0);                                    \
  }
153
// Pack and store 32 four-byte pixels (128 bytes) and advance pdst_argb by 128.
// The channel arguments are byte-interleaved in the order b, g, r, a.
// Note: the r_l, r_h, g_l and g_h arguments are used as scratch and are
// overwritten; every input is read before the first of them is written.
#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
  {                                                                    \
    __m256i bg_l, ra_l, bg_h, ra_h;                                    \
                                                                       \
    bg_l = __lasx_xvpackev_b(g_l, b_l);                                \
    ra_l = __lasx_xvpackev_b(a_l, r_l);                                \
    bg_h = __lasx_xvpackev_b(g_h, b_h);                                \
    ra_h = __lasx_xvpackev_b(a_h, r_h);                                \
    r_l = __lasx_xvilvl_h(ra_l, bg_l);                                 \
    r_h = __lasx_xvilvh_h(ra_l, bg_l);                                 \
    g_l = __lasx_xvilvl_h(ra_h, bg_h);                                 \
    g_h = __lasx_xvilvh_h(ra_h, bg_h);                                 \
    __lasx_xvst(__lasx_xvpermi_q(r_h, r_l, 0x20), pdst_argb, 0);       \
    __lasx_xvst(__lasx_xvpermi_q(g_h, g_l, 0x20), pdst_argb, 32);      \
    __lasx_xvst(__lasx_xvpermi_q(r_h, r_l, 0x31), pdst_argb, 64);      \
    __lasx_xvst(__lasx_xvpermi_q(g_h, g_l, 0x31), pdst_argb, 96);      \
    pdst_argb += 128;                                                  \
  }
177
// Pack and store 16 four-byte pixels (64 bytes) and advance pdst_argb by 64.
// The channel arguments are byte-interleaved in the order b, g, r, a; none of
// them is modified.
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb)                \
  {                                                                 \
    __m256i bg, ra, px_l, px_h;                                     \
                                                                    \
    bg = __lasx_xvpackev_b(in_g, in_b);                             \
    ra = __lasx_xvpackev_b(in_a, in_r);                             \
    px_l = __lasx_xvilvl_h(ra, bg);                                 \
    px_h = __lasx_xvilvh_h(ra, bg);                                 \
    __lasx_xvst(__lasx_xvpermi_q(px_h, px_l, 0x20), pdst_argb, 0);  \
    __lasx_xvst(__lasx_xvpermi_q(px_h, px_l, 0x31), pdst_argb, 32); \
    pdst_argb += 64;                                                \
  }
193
// Accumulate U and V terms from two rows of separated B, G and R bytes.
// For each channel the current-row and next-row bytes are summed (even and odd
// bytes separately) and the two sums are rounding-averaged; the averages are
// written back to _tmpb / _tmpg / _tmpr. Then:
//   _reg0 = const_8080 + const_112 * b - const_74 * g - const_38 * r
//   _reg1 = const_8080 + const_112 * r - const_94 * g - const_18 * b
// Requires const_8080, const_112, const_74, const_94, const_38 and const_18 in
// the calling scope (their values are not visible here).
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
  {                                                                     \
    __m256i _sb_e, _sb_o, _sg_e, _sg_o, _sr_e, _sr_o;                   \
    _sb_e = __lasx_xvaddwev_h_bu(_tmpb, _nexb);                         \
    _sb_o = __lasx_xvaddwod_h_bu(_tmpb, _nexb);                         \
    _sg_e = __lasx_xvaddwev_h_bu(_tmpg, _nexg);                         \
    _sg_o = __lasx_xvaddwod_h_bu(_tmpg, _nexg);                         \
    _sr_e = __lasx_xvaddwev_h_bu(_tmpr, _nexr);                         \
    _sr_o = __lasx_xvaddwod_h_bu(_tmpr, _nexr);                         \
    _tmpb = __lasx_xvavgr_hu(_sb_e, _sb_o);                             \
    _tmpg = __lasx_xvavgr_hu(_sg_e, _sg_o);                             \
    _tmpr = __lasx_xvavgr_hu(_sr_e, _sr_o);                             \
    _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb);              \
    _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg);                    \
    _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr);                    \
    _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr);              \
    _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg);                    \
    _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb);                    \
  }
213
MirrorRow_LASX(const uint8_t * src,uint8_t * dst,int width)214 void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
215 int x;
216 int len = width / 64;
217 __m256i src0, src1;
218 __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607,
219 0x08090A0B0C0D0E0F, 0x0001020304050607};
220 src += width - 64;
221 for (x = 0; x < len; x++) {
222 DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
223 DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
224 src1);
225 src0 = __lasx_xvpermi_q(src0, src0, 0x01);
226 src1 = __lasx_xvpermi_q(src1, src1, 0x01);
227 __lasx_xvst(src1, dst, 0);
228 __lasx_xvst(src0, dst, 32);
229 dst += 64;
230 src -= 64;
231 }
232 }
233
MirrorUVRow_LASX(const uint8_t * src_uv,uint8_t * dst_uv,int width)234 void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
235 int x;
236 int len = width / 16;
237 __m256i src, dst;
238 __m256i shuffler = {0x0004000500060007, 0x0000000100020003,
239 0x0004000500060007, 0x0000000100020003};
240
241 src_uv += (width - 16) << 1;
242 for (x = 0; x < len; x++) {
243 src = __lasx_xvld(src_uv, 0);
244 dst = __lasx_xvshuf_h(shuffler, src, src);
245 dst = __lasx_xvpermi_q(dst, dst, 0x01);
246 __lasx_xvst(dst, dst_uv, 0);
247 src_uv -= 32;
248 dst_uv += 32;
249 }
250 }
251
ARGBMirrorRow_LASX(const uint8_t * src,uint8_t * dst,int width)252 void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
253 int x;
254 int len = width / 16;
255 __m256i src0, src1;
256 __m256i dst0, dst1;
257 __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504,
258 0x0B0A09080F0E0D0C, 0x0302010007060504};
259 src += (width * 4) - 64;
260 for (x = 0; x < len; x++) {
261 DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
262 DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
263 src1);
264 dst1 = __lasx_xvpermi_q(src0, src0, 0x01);
265 dst0 = __lasx_xvpermi_q(src1, src1, 0x01);
266 __lasx_xvst(dst0, dst, 0);
267 __lasx_xvst(dst1, dst, 32);
268 dst += 64;
269 src -= 64;
270 }
271 }
272
I422ToYUY2Row_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)273 void I422ToYUY2Row_LASX(const uint8_t* src_y,
274 const uint8_t* src_u,
275 const uint8_t* src_v,
276 uint8_t* dst_yuy2,
277 int width) {
278 int x;
279 int len = width / 32;
280 __m256i src_u0, src_v0, src_y0, vec_uv0;
281 __m256i vec_yuy2_0, vec_yuy2_1;
282 __m256i dst_yuy2_0, dst_yuy2_1;
283
284 for (x = 0; x < len; x++) {
285 DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
286 src_y0 = __lasx_xvld(src_y, 0);
287 src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
288 src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
289 vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
290 vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0);
291 vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0);
292 dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20);
293 dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31);
294 __lasx_xvst(dst_yuy2_0, dst_yuy2, 0);
295 __lasx_xvst(dst_yuy2_1, dst_yuy2, 32);
296 src_u += 16;
297 src_v += 16;
298 src_y += 32;
299 dst_yuy2 += 64;
300 }
301 }
302
I422ToUYVYRow_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)303 void I422ToUYVYRow_LASX(const uint8_t* src_y,
304 const uint8_t* src_u,
305 const uint8_t* src_v,
306 uint8_t* dst_uyvy,
307 int width) {
308 int x;
309 int len = width / 32;
310 __m256i src_u0, src_v0, src_y0, vec_uv0;
311 __m256i vec_uyvy0, vec_uyvy1;
312 __m256i dst_uyvy0, dst_uyvy1;
313
314 for (x = 0; x < len; x++) {
315 DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
316 src_y0 = __lasx_xvld(src_y, 0);
317 src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
318 src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
319 vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
320 vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0);
321 vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0);
322 dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20);
323 dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31);
324 __lasx_xvst(dst_uyvy0, dst_uyvy, 0);
325 __lasx_xvst(dst_uyvy1, dst_uyvy, 32);
326 src_u += 16;
327 src_v += 16;
328 src_y += 32;
329 dst_uyvy += 64;
330 }
331 }
332
I422ToARGBRow_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)333 void I422ToARGBRow_LASX(const uint8_t* src_y,
334 const uint8_t* src_u,
335 const uint8_t* src_v,
336 uint8_t* dst_argb,
337 const struct YuvConstants* yuvconstants,
338 int width) {
339 int x;
340 int len = width / 32;
341 __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
342 __m256i vec_ubvr, vec_ugvg;
343 __m256i alpha = __lasx_xvldi(0xFF);
344 __m256i const_0x80 = __lasx_xvldi(0x80);
345
346 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
347 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
348 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
349
350 for (x = 0; x < len; x++) {
351 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
352
353 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
354 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
355 g_h, r_l, r_h);
356 STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
357 src_y += 32;
358 src_u += 16;
359 src_v += 16;
360 }
361 }
362
I422ToRGBARow_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)363 void I422ToRGBARow_LASX(const uint8_t* src_y,
364 const uint8_t* src_u,
365 const uint8_t* src_v,
366 uint8_t* dst_argb,
367 const struct YuvConstants* yuvconstants,
368 int width) {
369 int x;
370 int len = width / 32;
371 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
372 __m256i vec_ubvr, vec_ugvg;
373 __m256i alpha = __lasx_xvldi(0xFF);
374 __m256i const_0x80 = __lasx_xvldi(0x80);
375
376 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
377 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
378 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
379
380 for (x = 0; x < len; x++) {
381 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
382
383 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
384 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
385 g_h, r_l, r_h);
386 STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
387 src_y += 32;
388 src_u += 16;
389 src_v += 16;
390 }
391 }
392
I422AlphaToARGBRow_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)393 void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
394 const uint8_t* src_u,
395 const uint8_t* src_v,
396 const uint8_t* src_a,
397 uint8_t* dst_argb,
398 const struct YuvConstants* yuvconstants,
399 int width) {
400 int x;
401 int len = width / 32;
402 int res = width & 31;
403 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
404 __m256i vec_ubvr, vec_ugvg;
405 __m256i zero = __lasx_xvldi(0);
406 __m256i const_0x80 = __lasx_xvldi(0x80);
407
408 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
409 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
410 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
411
412 for (x = 0; x < len; x++) {
413 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
414
415 y = __lasx_xvld(src_a, 0);
416 a_l = __lasx_xvilvl_b(zero, y);
417 a_h = __lasx_xvilvh_b(zero, y);
418 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
419 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
420 g_h, r_l, r_h);
421 STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
422 src_y += 32;
423 src_u += 16;
424 src_v += 16;
425 src_a += 32;
426 }
427 if (res) {
428 __m256i y, uv, r, g, b, a;
429 a = __lasx_xvld(src_a, 0);
430 a = __lasx_vext2xv_hu_bu(a);
431 READYUV422(src_y, src_u, src_v, y, uv);
432 YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
433 STOREARGB(a, r, g, b, dst_argb);
434 }
435 }
436
I422ToRGB24Row_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int32_t width)437 void I422ToRGB24Row_LASX(const uint8_t* src_y,
438 const uint8_t* src_u,
439 const uint8_t* src_v,
440 uint8_t* dst_argb,
441 const struct YuvConstants* yuvconstants,
442 int32_t width) {
443 int x;
444 int len = width / 32;
445 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
446 __m256i vec_ubvr, vec_ugvg;
447 __m256i const_0x80 = __lasx_xvldi(0x80);
448 __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
449 0x0504120302100100, 0x0A18090816070614};
450 __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
451 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
452
453 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
454 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
455 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
456
457 for (x = 0; x < len; x++) {
458 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
459 __m256i temp0, temp1, temp2, temp3;
460
461 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
462 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
463 g_h, r_l, r_h);
464 temp0 = __lasx_xvpackev_b(g_l, b_l);
465 temp1 = __lasx_xvpackev_b(g_h, b_h);
466 DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
467 r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
468 temp1);
469
470 b_l = __lasx_xvilvl_d(temp1, temp2);
471 b_h = __lasx_xvilvh_d(temp3, temp1);
472 temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
473 temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
474 temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
475 __lasx_xvst(temp1, dst_argb, 0);
476 __lasx_xvst(temp2, dst_argb, 32);
477 __lasx_xvst(temp3, dst_argb, 64);
478 dst_argb += 96;
479 src_y += 32;
480 src_u += 16;
481 src_v += 16;
482 }
483 }
484
485 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
I422ToRGB565Row_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)486 void I422ToRGB565Row_LASX(const uint8_t* src_y,
487 const uint8_t* src_u,
488 const uint8_t* src_v,
489 uint8_t* dst_rgb565,
490 const struct YuvConstants* yuvconstants,
491 int width) {
492 int x;
493 int len = width / 32;
494 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
495 __m256i vec_ubvr, vec_ugvg;
496 __m256i const_0x80 = __lasx_xvldi(0x80);
497
498 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
499 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
500 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
501
502 for (x = 0; x < len; x++) {
503 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
504 __m256i dst_l, dst_h;
505
506 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
507 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
508 g_h, r_l, r_h);
509 b_l = __lasx_xvsrli_h(b_l, 3);
510 b_h = __lasx_xvsrli_h(b_h, 3);
511 g_l = __lasx_xvsrli_h(g_l, 2);
512 g_h = __lasx_xvsrli_h(g_h, 2);
513 r_l = __lasx_xvsrli_h(r_l, 3);
514 r_h = __lasx_xvsrli_h(r_h, 3);
515 r_l = __lasx_xvslli_h(r_l, 11);
516 r_h = __lasx_xvslli_h(r_h, 11);
517 g_l = __lasx_xvslli_h(g_l, 5);
518 g_h = __lasx_xvslli_h(g_h, 5);
519 r_l = __lasx_xvor_v(r_l, g_l);
520 r_l = __lasx_xvor_v(r_l, b_l);
521 r_h = __lasx_xvor_v(r_h, g_h);
522 r_h = __lasx_xvor_v(r_h, b_h);
523 dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
524 dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
525 __lasx_xvst(dst_l, dst_rgb565, 0);
526 __lasx_xvst(dst_h, dst_rgb565, 32);
527 dst_rgb565 += 64;
528 src_y += 32;
529 src_u += 16;
530 src_v += 16;
531 }
532 }
533
534 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
I422ToARGB4444Row_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)535 void I422ToARGB4444Row_LASX(const uint8_t* src_y,
536 const uint8_t* src_u,
537 const uint8_t* src_v,
538 uint8_t* dst_argb4444,
539 const struct YuvConstants* yuvconstants,
540 int width) {
541 int x;
542 int len = width / 32;
543 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
544 __m256i vec_ubvr, vec_ugvg;
545 __m256i const_0x80 = __lasx_xvldi(0x80);
546 __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000,
547 0xF000F000F000F000};
548 __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0,
549 0x00F000F000F000F0};
550
551 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
552 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
553 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
554
555 for (x = 0; x < len; x++) {
556 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
557 __m256i dst_l, dst_h;
558
559 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
560 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
561 g_h, r_l, r_h);
562 b_l = __lasx_xvsrli_h(b_l, 4);
563 b_h = __lasx_xvsrli_h(b_h, 4);
564 r_l = __lasx_xvsrli_h(r_l, 4);
565 r_h = __lasx_xvsrli_h(r_h, 4);
566 g_l = __lasx_xvand_v(g_l, mask);
567 g_h = __lasx_xvand_v(g_h, mask);
568 r_l = __lasx_xvslli_h(r_l, 8);
569 r_h = __lasx_xvslli_h(r_h, 8);
570 r_l = __lasx_xvor_v(r_l, alpha);
571 r_h = __lasx_xvor_v(r_h, alpha);
572 r_l = __lasx_xvor_v(r_l, g_l);
573 r_h = __lasx_xvor_v(r_h, g_h);
574 r_l = __lasx_xvor_v(r_l, b_l);
575 r_h = __lasx_xvor_v(r_h, b_h);
576 dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
577 dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
578 __lasx_xvst(dst_l, dst_argb4444, 0);
579 __lasx_xvst(dst_h, dst_argb4444, 32);
580 dst_argb4444 += 64;
581 src_y += 32;
582 src_u += 16;
583 src_v += 16;
584 }
585 }
586
I422ToARGB1555Row_LASX(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)587 void I422ToARGB1555Row_LASX(const uint8_t* src_y,
588 const uint8_t* src_u,
589 const uint8_t* src_v,
590 uint8_t* dst_argb1555,
591 const struct YuvConstants* yuvconstants,
592 int width) {
593 int x;
594 int len = width / 32;
595 __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
596 __m256i vec_ubvr, vec_ugvg;
597 __m256i const_0x80 = __lasx_xvldi(0x80);
598 __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000,
599 0x8000800080008000};
600
601 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
602 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
603 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
604
605 for (x = 0; x < len; x++) {
606 __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
607 __m256i dst_l, dst_h;
608
609 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
610 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
611 g_h, r_l, r_h);
612 b_l = __lasx_xvsrli_h(b_l, 3);
613 b_h = __lasx_xvsrli_h(b_h, 3);
614 g_l = __lasx_xvsrli_h(g_l, 3);
615 g_h = __lasx_xvsrli_h(g_h, 3);
616 g_l = __lasx_xvslli_h(g_l, 5);
617 g_h = __lasx_xvslli_h(g_h, 5);
618 r_l = __lasx_xvsrli_h(r_l, 3);
619 r_h = __lasx_xvsrli_h(r_h, 3);
620 r_l = __lasx_xvslli_h(r_l, 10);
621 r_h = __lasx_xvslli_h(r_h, 10);
622 r_l = __lasx_xvor_v(r_l, alpha);
623 r_h = __lasx_xvor_v(r_h, alpha);
624 r_l = __lasx_xvor_v(r_l, g_l);
625 r_h = __lasx_xvor_v(r_h, g_h);
626 r_l = __lasx_xvor_v(r_l, b_l);
627 r_h = __lasx_xvor_v(r_h, b_h);
628 dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
629 dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
630 __lasx_xvst(dst_l, dst_argb1555, 0);
631 __lasx_xvst(dst_h, dst_argb1555, 32);
632 dst_argb1555 += 64;
633 src_y += 32;
634 src_u += 16;
635 src_v += 16;
636 }
637 }
638
YUY2ToYRow_LASX(const uint8_t * src_yuy2,uint8_t * dst_y,int width)639 void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
640 int x;
641 int len = width / 32;
642 __m256i src0, src1, dst0;
643
644 for (x = 0; x < len; x++) {
645 DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
646 dst0 = __lasx_xvpickev_b(src1, src0);
647 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
648 __lasx_xvst(dst0, dst_y, 0);
649 src_yuy2 += 64;
650 dst_y += 32;
651 }
652 }
653
YUY2ToUVRow_LASX(const uint8_t * src_yuy2,int src_stride_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)654 void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
655 int src_stride_yuy2,
656 uint8_t* dst_u,
657 uint8_t* dst_v,
658 int width) {
659 const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
660 int x;
661 int len = width / 32;
662 __m256i src0, src1, src2, src3;
663 __m256i tmp0, dst0, dst1;
664
665 for (x = 0; x < len; x++) {
666 DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0,
667 src_yuy2_next, 32, src0, src1, src2, src3);
668 src0 = __lasx_xvpickod_b(src1, src0);
669 src1 = __lasx_xvpickod_b(src3, src2);
670 tmp0 = __lasx_xvavgr_bu(src1, src0);
671 tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
672 dst0 = __lasx_xvpickev_b(tmp0, tmp0);
673 dst1 = __lasx_xvpickod_b(tmp0, tmp0);
674 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
675 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
676 __lasx_xvstelm_d(dst1, dst_v, 0, 0);
677 __lasx_xvstelm_d(dst1, dst_v, 8, 2);
678 src_yuy2 += 64;
679 src_yuy2_next += 64;
680 dst_u += 16;
681 dst_v += 16;
682 }
683 }
684
YUY2ToUV422Row_LASX(const uint8_t * src_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)685 void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
686 uint8_t* dst_u,
687 uint8_t* dst_v,
688 int width) {
689 int x;
690 int len = width / 32;
691 __m256i src0, src1, tmp0, dst0, dst1;
692
693 for (x = 0; x < len; x++) {
694 DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
695 tmp0 = __lasx_xvpickod_b(src1, src0);
696 tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
697 dst0 = __lasx_xvpickev_b(tmp0, tmp0);
698 dst1 = __lasx_xvpickod_b(tmp0, tmp0);
699 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
700 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
701 __lasx_xvstelm_d(dst1, dst_v, 0, 0);
702 __lasx_xvstelm_d(dst1, dst_v, 8, 2);
703 src_yuy2 += 64;
704 dst_u += 16;
705 dst_v += 16;
706 }
707 }
708
UYVYToYRow_LASX(const uint8_t * src_uyvy,uint8_t * dst_y,int width)709 void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
710 int x;
711 int len = width / 32;
712 __m256i src0, src1, dst0;
713
714 for (x = 0; x < len; x++) {
715 DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
716 dst0 = __lasx_xvpickod_b(src1, src0);
717 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
718 __lasx_xvst(dst0, dst_y, 0);
719 src_uyvy += 64;
720 dst_y += 32;
721 }
722 }
723
UYVYToUVRow_LASX(const uint8_t * src_uyvy,int src_stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)724 void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
725 int src_stride_uyvy,
726 uint8_t* dst_u,
727 uint8_t* dst_v,
728 int width) {
729 const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
730 int x;
731 int len = width / 32;
732 __m256i src0, src1, src2, src3, tmp0, dst0, dst1;
733
734 for (x = 0; x < len; x++) {
735 DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0,
736 src_uyvy_next, 32, src0, src1, src2, src3);
737 src0 = __lasx_xvpickev_b(src1, src0);
738 src1 = __lasx_xvpickev_b(src3, src2);
739 tmp0 = __lasx_xvavgr_bu(src1, src0);
740 tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
741 dst0 = __lasx_xvpickev_b(tmp0, tmp0);
742 dst1 = __lasx_xvpickod_b(tmp0, tmp0);
743 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
744 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
745 __lasx_xvstelm_d(dst1, dst_v, 0, 0);
746 __lasx_xvstelm_d(dst1, dst_v, 8, 2);
747 src_uyvy += 64;
748 src_uyvy_next += 64;
749 dst_u += 16;
750 dst_v += 16;
751 }
752 }
753
UYVYToUV422Row_LASX(const uint8_t * src_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)754 void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
755 uint8_t* dst_u,
756 uint8_t* dst_v,
757 int width) {
758 int x;
759 int len = width / 32;
760 __m256i src0, src1, tmp0, dst0, dst1;
761
762 for (x = 0; x < len; x++) {
763 DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
764 tmp0 = __lasx_xvpickev_b(src1, src0);
765 tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
766 dst0 = __lasx_xvpickev_b(tmp0, tmp0);
767 dst1 = __lasx_xvpickod_b(tmp0, tmp0);
768 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
769 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
770 __lasx_xvstelm_d(dst1, dst_v, 0, 0);
771 __lasx_xvstelm_d(dst1, dst_v, 8, 2);
772 src_uyvy += 64;
773 dst_u += 16;
774 dst_v += 16;
775 }
776 }
777
ARGBToYRow_LASX(const uint8_t * src_argb0,uint8_t * dst_y,int width)778 void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
779 int x;
780 int len = width / 32;
781 __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3;
782 __m256i tmp0, tmp1, dst0;
783 __m256i const_19 = __lasx_xvldi(0x19);
784 __m256i const_42 = __lasx_xvldi(0x42);
785 __m256i const_81 = __lasx_xvldi(0x81);
786 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
787 0x1080108010801080, 0x1080108010801080};
788 __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
789 0x0000000700000003};
790
791 for (x = 0; x < len; x++) {
792 DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
793 src_argb0, 96, src0, src1, src2, src3);
794 vec0 = __lasx_xvpickev_b(src1, src0);
795 vec1 = __lasx_xvpickev_b(src3, src2);
796 vec2 = __lasx_xvpickod_b(src1, src0);
797 vec3 = __lasx_xvpickod_b(src3, src2);
798 tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19);
799 tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19);
800 tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81);
801 tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81);
802 tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42);
803 tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42);
804 dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8);
805 dst0 = __lasx_xvperm_w(dst0, control);
806 __lasx_xvst(dst0, dst_y, 0);
807 src_argb0 += 128;
808 dst_y += 32;
809 }
810 }
811
ARGBToUVRow_LASX(const uint8_t * src_argb0,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)812 void ARGBToUVRow_LASX(const uint8_t* src_argb0,
813 int src_stride_argb,
814 uint8_t* dst_u,
815 uint8_t* dst_v,
816 int width) {
817 int x;
818 int len = width / 32;
819 const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
820
821 __m256i src0, src1, src2, src3, src4, src5, src6, src7;
822 __m256i vec0, vec1, vec2, vec3;
823 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
824 __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
825 0x0038003800380038, 0x0038003800380038};
826 __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
827 0x0025002500250025, 0x0025002500250025};
828 __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
829 0x0013001300130013, 0x0013001300130013};
830 __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
831 0x002f002f002f002f, 0x002f002f002f002f};
832 __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
833 0x0009000900090009, 0x0009000900090009};
834 __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
835 0x0000000700000003};
836 __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
837 0x8080808080808080, 0x8080808080808080};
838
839 for (x = 0; x < len; x++) {
840 DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
841 src_argb0, 96, src0, src1, src2, src3);
842 DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
843 src_argb1, 96, src4, src5, src6, src7);
844 vec0 = __lasx_xvaddwev_h_bu(src0, src4);
845 vec1 = __lasx_xvaddwev_h_bu(src1, src5);
846 vec2 = __lasx_xvaddwev_h_bu(src2, src6);
847 vec3 = __lasx_xvaddwev_h_bu(src3, src7);
848 tmp0 = __lasx_xvpickev_h(vec1, vec0);
849 tmp1 = __lasx_xvpickev_h(vec3, vec2);
850 tmp2 = __lasx_xvpickod_h(vec1, vec0);
851 tmp3 = __lasx_xvpickod_h(vec3, vec2);
852 vec0 = __lasx_xvaddwod_h_bu(src0, src4);
853 vec1 = __lasx_xvaddwod_h_bu(src1, src5);
854 vec2 = __lasx_xvaddwod_h_bu(src2, src6);
855 vec3 = __lasx_xvaddwod_h_bu(src3, src7);
856 tmp4 = __lasx_xvpickev_h(vec1, vec0);
857 tmp5 = __lasx_xvpickev_h(vec3, vec2);
858 vec0 = __lasx_xvpickev_h(tmp1, tmp0);
859 vec1 = __lasx_xvpickod_h(tmp1, tmp0);
860 src0 = __lasx_xvavgr_h(vec0, vec1);
861 vec0 = __lasx_xvpickev_h(tmp3, tmp2);
862 vec1 = __lasx_xvpickod_h(tmp3, tmp2);
863 src1 = __lasx_xvavgr_h(vec0, vec1);
864 vec0 = __lasx_xvpickev_h(tmp5, tmp4);
865 vec1 = __lasx_xvpickod_h(tmp5, tmp4);
866 src2 = __lasx_xvavgr_h(vec0, vec1);
867 dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
868 dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
869 dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
870 dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
871 dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
872 dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
873 dst0 = __lasx_xvperm_w(dst0, control);
874 dst1 = __lasx_xvperm_w(dst1, control);
875 dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
876 dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
877 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
878 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
879 __lasx_xvstelm_d(dst1, dst_v, 0, 0);
880 __lasx_xvstelm_d(dst1, dst_v, 8, 2);
881 src_argb0 += 128;
882 src_argb1 += 128;
883 dst_u += 16;
884 dst_v += 16;
885 }
886 }
887
ARGBToRGB24Row_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,int width)888 void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
889 int x;
890 int len = (width / 32) - 1;
891 __m256i src0, src1, src2, src3;
892 __m256i tmp0, tmp1, tmp2, tmp3;
893 __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100,
894 0x000000000E0D0C0A};
895 __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
896 0x0000000700000003};
897 for (x = 0; x < len; x++) {
898 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
899 96, src0, src1, src2, src3);
900 tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
901 tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
902 tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
903 tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
904 tmp0 = __lasx_xvperm_w(tmp0, control);
905 tmp1 = __lasx_xvperm_w(tmp1, control);
906 tmp2 = __lasx_xvperm_w(tmp2, control);
907 tmp3 = __lasx_xvperm_w(tmp3, control);
908 __lasx_xvst(tmp0, dst_rgb, 0);
909 __lasx_xvst(tmp1, dst_rgb, 24);
910 __lasx_xvst(tmp2, dst_rgb, 48);
911 __lasx_xvst(tmp3, dst_rgb, 72);
912 dst_rgb += 96;
913 src_argb += 128;
914 }
915 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
916 src0, src1, src2, src3);
917 tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
918 tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
919 tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
920 tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
921 tmp0 = __lasx_xvperm_w(tmp0, control);
922 tmp1 = __lasx_xvperm_w(tmp1, control);
923 tmp2 = __lasx_xvperm_w(tmp2, control);
924 tmp3 = __lasx_xvperm_w(tmp3, control);
925 __lasx_xvst(tmp0, dst_rgb, 0);
926 __lasx_xvst(tmp1, dst_rgb, 24);
927 __lasx_xvst(tmp2, dst_rgb, 48);
928 dst_rgb += 72;
929 __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
930 __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
931 __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
932 }
933
ARGBToRAWRow_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,int width)934 void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
935 int x;
936 int len = (width / 32) - 1;
937 __m256i src0, src1, src2, src3;
938 __m256i tmp0, tmp1, tmp2, tmp3;
939 __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102,
940 0x000000000C0D0E08};
941 __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
942 0x0000000700000003};
943 for (x = 0; x < len; x++) {
944 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
945 96, src0, src1, src2, src3);
946 tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
947 tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
948 tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
949 tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
950 tmp0 = __lasx_xvperm_w(tmp0, control);
951 tmp1 = __lasx_xvperm_w(tmp1, control);
952 tmp2 = __lasx_xvperm_w(tmp2, control);
953 tmp3 = __lasx_xvperm_w(tmp3, control);
954 __lasx_xvst(tmp0, dst_rgb, 0);
955 __lasx_xvst(tmp1, dst_rgb, 24);
956 __lasx_xvst(tmp2, dst_rgb, 48);
957 __lasx_xvst(tmp3, dst_rgb, 72);
958 dst_rgb += 96;
959 src_argb += 128;
960 }
961 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
962 src0, src1, src2, src3);
963 tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
964 tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
965 tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
966 tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
967 tmp0 = __lasx_xvperm_w(tmp0, control);
968 tmp1 = __lasx_xvperm_w(tmp1, control);
969 tmp2 = __lasx_xvperm_w(tmp2, control);
970 tmp3 = __lasx_xvperm_w(tmp3, control);
971 __lasx_xvst(tmp0, dst_rgb, 0);
972 __lasx_xvst(tmp1, dst_rgb, 24);
973 __lasx_xvst(tmp2, dst_rgb, 48);
974 dst_rgb += 72;
975 __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
976 __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
977 __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
978 }
979
ARGBToRGB565Row_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,int width)980 void ARGBToRGB565Row_LASX(const uint8_t* src_argb,
981 uint8_t* dst_rgb,
982 int width) {
983 int x;
984 int len = width / 16;
985 __m256i zero = __lasx_xvldi(0);
986 __m256i src0, src1, tmp0, tmp1, dst0;
987 __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300,
988 0x0300030003000300};
989
990 for (x = 0; x < len; x++) {
991 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
992 tmp0 = __lasx_xvpickev_b(src1, src0);
993 tmp1 = __lasx_xvpickod_b(src1, src0);
994 tmp0 = __lasx_xvsrli_b(tmp0, 3);
995 tmp1 = __lasx_xvpackev_b(zero, tmp1);
996 tmp1 = __lasx_xvsrli_h(tmp1, 2);
997 tmp0 = __lasx_xvsll_b(tmp0, shift);
998 tmp1 = __lasx_xvslli_h(tmp1, 5);
999 dst0 = __lasx_xvor_v(tmp0, tmp1);
1000 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1001 __lasx_xvst(dst0, dst_rgb, 0);
1002 dst_rgb += 32;
1003 src_argb += 64;
1004 }
1005 }
1006
ARGBToARGB1555Row_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,int width)1007 void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
1008 uint8_t* dst_rgb,
1009 int width) {
1010 int x;
1011 int len = width / 16;
1012 __m256i zero = __lasx_xvldi(0);
1013 __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
1014 __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703,
1015 0x0703070307030703};
1016 __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200,
1017 0x0200020002000200};
1018
1019 for (x = 0; x < len; x++) {
1020 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1021 tmp0 = __lasx_xvpickev_b(src1, src0);
1022 tmp1 = __lasx_xvpickod_b(src1, src0);
1023 tmp0 = __lasx_xvsrli_b(tmp0, 3);
1024 tmp1 = __lasx_xvsrl_b(tmp1, shift1);
1025 tmp0 = __lasx_xvsll_b(tmp0, shift2);
1026 tmp2 = __lasx_xvpackev_b(zero, tmp1);
1027 tmp3 = __lasx_xvpackod_b(zero, tmp1);
1028 tmp2 = __lasx_xvslli_h(tmp2, 5);
1029 tmp3 = __lasx_xvslli_h(tmp3, 15);
1030 dst0 = __lasx_xvor_v(tmp0, tmp2);
1031 dst0 = __lasx_xvor_v(dst0, tmp3);
1032 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1033 __lasx_xvst(dst0, dst_rgb, 0);
1034 dst_rgb += 32;
1035 src_argb += 64;
1036 }
1037 }
1038
ARGBToARGB4444Row_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,int width)1039 void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
1040 uint8_t* dst_rgb,
1041 int width) {
1042 int x;
1043 int len = width / 16;
1044 __m256i src0, src1, tmp0, tmp1, dst0;
1045
1046 for (x = 0; x < len; x++) {
1047 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1048 tmp0 = __lasx_xvpickev_b(src1, src0);
1049 tmp1 = __lasx_xvpickod_b(src1, src0);
1050 tmp1 = __lasx_xvandi_b(tmp1, 0xF0);
1051 tmp0 = __lasx_xvsrli_b(tmp0, 4);
1052 dst0 = __lasx_xvor_v(tmp1, tmp0);
1053 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1054 __lasx_xvst(dst0, dst_rgb, 0);
1055 dst_rgb += 32;
1056 src_argb += 64;
1057 }
1058 }
1059
ARGBToUV444Row_LASX(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int32_t width)1060 void ARGBToUV444Row_LASX(const uint8_t* src_argb,
1061 uint8_t* dst_u,
1062 uint8_t* dst_v,
1063 int32_t width) {
1064 int x;
1065 int len = width / 32;
1066 __m256i src0, src1, src2, src3;
1067 __m256i tmp0, tmp1, tmp2, tmp3;
1068 __m256i reg0, reg1, reg2, reg3, dst0, dst1;
1069 __m256i const_112 = __lasx_xvldi(112);
1070 __m256i const_74 = __lasx_xvldi(74);
1071 __m256i const_38 = __lasx_xvldi(38);
1072 __m256i const_94 = __lasx_xvldi(94);
1073 __m256i const_18 = __lasx_xvldi(18);
1074 __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
1075 0x8080808080808080, 0x8080808080808080};
1076 __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
1077 0x0000000700000003};
1078 for (x = 0; x < len; x++) {
1079 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
1080 96, src0, src1, src2, src3);
1081 tmp0 = __lasx_xvpickev_h(src1, src0);
1082 tmp1 = __lasx_xvpickod_h(src1, src0);
1083 tmp2 = __lasx_xvpickev_h(src3, src2);
1084 tmp3 = __lasx_xvpickod_h(src3, src2);
1085 reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
1086 reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
1087 reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
1088 reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
1089 reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
1090 reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
1091 reg0 = __lasx_xvsub_h(reg0, reg2);
1092 reg1 = __lasx_xvsub_h(reg1, reg3);
1093 dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
1094 dst0 = __lasx_xvperm_w(dst0, control);
1095 reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
1096 reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
1097 reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
1098 reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
1099 reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
1100 reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
1101 reg0 = __lasx_xvsub_h(reg0, reg2);
1102 reg1 = __lasx_xvsub_h(reg1, reg3);
1103 dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
1104 dst1 = __lasx_xvperm_w(dst1, control);
1105 __lasx_xvst(dst0, dst_u, 0);
1106 __lasx_xvst(dst1, dst_v, 0);
1107 dst_u += 32;
1108 dst_v += 32;
1109 src_argb += 128;
1110 }
1111 }
1112
ARGBMultiplyRow_LASX(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)1113 void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
1114 const uint8_t* src_argb1,
1115 uint8_t* dst_argb,
1116 int width) {
1117 int x;
1118 int len = width / 8;
1119 __m256i zero = __lasx_xvldi(0);
1120 __m256i src0, src1, dst0, dst1;
1121 __m256i tmp0, tmp1, tmp2, tmp3;
1122
1123 for (x = 0; x < len; x++) {
1124 DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
1125 tmp0 = __lasx_xvilvl_b(src0, src0);
1126 tmp1 = __lasx_xvilvh_b(src0, src0);
1127 tmp2 = __lasx_xvilvl_b(zero, src1);
1128 tmp3 = __lasx_xvilvh_b(zero, src1);
1129 dst0 = __lasx_xvmuh_hu(tmp0, tmp2);
1130 dst1 = __lasx_xvmuh_hu(tmp1, tmp3);
1131 dst0 = __lasx_xvpickev_b(dst1, dst0);
1132 __lasx_xvst(dst0, dst_argb, 0);
1133 src_argb0 += 32;
1134 src_argb1 += 32;
1135 dst_argb += 32;
1136 }
1137 }
1138
ARGBAddRow_LASX(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)1139 void ARGBAddRow_LASX(const uint8_t* src_argb0,
1140 const uint8_t* src_argb1,
1141 uint8_t* dst_argb,
1142 int width) {
1143 int x;
1144 int len = width / 8;
1145 __m256i src0, src1, dst0;
1146
1147 for (x = 0; x < len; x++) {
1148 DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
1149 dst0 = __lasx_xvsadd_bu(src0, src1);
1150 __lasx_xvst(dst0, dst_argb, 0);
1151 src_argb0 += 32;
1152 src_argb1 += 32;
1153 dst_argb += 32;
1154 }
1155 }
1156
ARGBSubtractRow_LASX(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)1157 void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
1158 const uint8_t* src_argb1,
1159 uint8_t* dst_argb,
1160 int width) {
1161 int x;
1162 int len = width / 8;
1163 __m256i src0, src1, dst0;
1164
1165 for (x = 0; x < len; x++) {
1166 DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
1167 dst0 = __lasx_xvssub_bu(src0, src1);
1168 __lasx_xvst(dst0, dst_argb, 0);
1169 src_argb0 += 32;
1170 src_argb1 += 32;
1171 dst_argb += 32;
1172 }
1173 }
1174
ARGBAttenuateRow_LASX(const uint8_t * src_argb,uint8_t * dst_argb,int width)1175 void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
1176 uint8_t* dst_argb,
1177 int width) {
1178 int x;
1179 int len = width / 16;
1180 __m256i src0, src1, tmp0, tmp1;
1181 __m256i reg0, reg1, reg2, reg3, reg4, reg5;
1182 __m256i b, g, r, a, dst0, dst1;
1183 __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
1184 0x0007000300060002};
1185
1186 for (x = 0; x < len; x++) {
1187 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1188 tmp0 = __lasx_xvpickev_b(src1, src0);
1189 tmp1 = __lasx_xvpickod_b(src1, src0);
1190 b = __lasx_xvpackev_b(tmp0, tmp0);
1191 r = __lasx_xvpackod_b(tmp0, tmp0);
1192 g = __lasx_xvpackev_b(tmp1, tmp1);
1193 a = __lasx_xvpackod_b(tmp1, tmp1);
1194 reg0 = __lasx_xvmulwev_w_hu(b, a);
1195 reg1 = __lasx_xvmulwod_w_hu(b, a);
1196 reg2 = __lasx_xvmulwev_w_hu(r, a);
1197 reg3 = __lasx_xvmulwod_w_hu(r, a);
1198 reg4 = __lasx_xvmulwev_w_hu(g, a);
1199 reg5 = __lasx_xvmulwod_w_hu(g, a);
1200 reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
1201 reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
1202 reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
1203 reg0 = __lasx_xvshuf_h(control, reg0, reg0);
1204 reg2 = __lasx_xvshuf_h(control, reg2, reg2);
1205 reg4 = __lasx_xvshuf_h(control, reg4, reg4);
1206 tmp0 = __lasx_xvpackev_b(reg4, reg0);
1207 tmp1 = __lasx_xvpackev_b(a, reg2);
1208 dst0 = __lasx_xvilvl_h(tmp1, tmp0);
1209 dst1 = __lasx_xvilvh_h(tmp1, tmp0);
1210 __lasx_xvst(dst0, dst_argb, 0);
1211 __lasx_xvst(dst1, dst_argb, 32);
1212 dst_argb += 64;
1213 src_argb += 64;
1214 }
1215 }
1216
ARGBToRGB565DitherRow_LASX(const uint8_t * src_argb,uint8_t * dst_rgb,const uint32_t dither4,int width)1217 void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
1218 uint8_t* dst_rgb,
1219 const uint32_t dither4,
1220 int width) {
1221 int x;
1222 int len = width / 16;
1223 __m256i src0, src1, tmp0, tmp1, dst0;
1224 __m256i b, g, r;
1225 __m256i zero = __lasx_xvldi(0);
1226 __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);
1227
1228 vec_dither = __lasx_xvilvl_b(zero, vec_dither);
1229 for (x = 0; x < len; x++) {
1230 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1231 tmp0 = __lasx_xvpickev_b(src1, src0);
1232 tmp1 = __lasx_xvpickod_b(src1, src0);
1233 b = __lasx_xvpackev_b(zero, tmp0);
1234 r = __lasx_xvpackod_b(zero, tmp0);
1235 g = __lasx_xvpackev_b(zero, tmp1);
1236 b = __lasx_xvadd_h(b, vec_dither);
1237 g = __lasx_xvadd_h(g, vec_dither);
1238 r = __lasx_xvadd_h(r, vec_dither);
1239 DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);
1240 r = __lasx_xvclip255_h(r);
1241 b = __lasx_xvsrai_h(b, 3);
1242 g = __lasx_xvsrai_h(g, 2);
1243 r = __lasx_xvsrai_h(r, 3);
1244 g = __lasx_xvslli_h(g, 5);
1245 r = __lasx_xvslli_h(r, 11);
1246 dst0 = __lasx_xvor_v(b, g);
1247 dst0 = __lasx_xvor_v(dst0, r);
1248 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1249 __lasx_xvst(dst0, dst_rgb, 0);
1250 src_argb += 64;
1251 dst_rgb += 32;
1252 }
1253 }
1254
ARGBShuffleRow_LASX(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)1255 void ARGBShuffleRow_LASX(const uint8_t* src_argb,
1256 uint8_t* dst_argb,
1257 const uint8_t* shuffler,
1258 int width) {
1259 int x;
1260 int len = width / 16;
1261 __m256i src0, src1, dst0, dst1;
1262 __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000,
1263 0x0C0C0C0C08080808};
1264 __m256i temp = __lasx_xvldrepl_w(shuffler, 0);
1265
1266 shuf = __lasx_xvadd_b(shuf, temp);
1267 for (x = 0; x < len; x++) {
1268 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1269 dst0 = __lasx_xvshuf_b(src0, src0, shuf);
1270 dst1 = __lasx_xvshuf_b(src1, src1, shuf);
1271 __lasx_xvst(dst0, dst_argb, 0);
1272 __lasx_xvst(dst1, dst_argb, 32);
1273 src_argb += 64;
1274 dst_argb += 64;
1275 }
1276 }
1277
ARGBShadeRow_LASX(const uint8_t * src_argb,uint8_t * dst_argb,int width,uint32_t value)1278 void ARGBShadeRow_LASX(const uint8_t* src_argb,
1279 uint8_t* dst_argb,
1280 int width,
1281 uint32_t value) {
1282 int x;
1283 int len = width / 8;
1284 __m256i src0, dst0, tmp0, tmp1;
1285 __m256i vec_value = __lasx_xvreplgr2vr_w(value);
1286
1287 vec_value = __lasx_xvilvl_b(vec_value, vec_value);
1288 for (x = 0; x < len; x++) {
1289 src0 = __lasx_xvld(src_argb, 0);
1290 tmp0 = __lasx_xvilvl_b(src0, src0);
1291 tmp1 = __lasx_xvilvh_b(src0, src0);
1292 tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);
1293 tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
1294 dst0 = __lasx_xvpickod_b(tmp1, tmp0);
1295 __lasx_xvst(dst0, dst_argb, 0);
1296 src_argb += 32;
1297 dst_argb += 32;
1298 }
1299 }
1300
ARGBGrayRow_LASX(const uint8_t * src_argb,uint8_t * dst_argb,int width)1301 void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1302 int x;
1303 int len = width / 16;
1304 __m256i src0, src1, tmp0, tmp1;
1305 __m256i reg0, reg1, reg2, dst0, dst1;
1306 __m256i const_128 = __lasx_xvldi(0x480);
1307 __m256i const_150 = __lasx_xvldi(0x96);
1308 __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
1309 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
1310
1311 for (x = 0; x < len; x++) {
1312 DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
1313 tmp0 = __lasx_xvpickev_b(src1, src0);
1314 tmp1 = __lasx_xvpickod_b(src1, src0);
1315 reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);
1316 reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);
1317 reg2 = __lasx_xvadd_h(reg0, reg1);
1318 tmp0 = __lasx_xvpackod_b(reg2, reg2);
1319 tmp1 = __lasx_xvpackod_b(tmp1, reg2);
1320 dst0 = __lasx_xvilvl_h(tmp1, tmp0);
1321 dst1 = __lasx_xvilvh_h(tmp1, tmp0);
1322 __lasx_xvst(dst0, dst_argb, 0);
1323 __lasx_xvst(dst1, dst_argb, 32);
1324 src_argb += 64;
1325 dst_argb += 64;
1326 }
1327 }
1328
ARGBSepiaRow_LASX(uint8_t * dst_argb,int width)1329 void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
1330 int x;
1331 int len = width / 16;
1332 __m256i src0, src1, tmp0, tmp1;
1333 __m256i reg0, reg1, spb, spg, spr;
1334 __m256i dst0, dst1;
1335 __m256i spb_g = __lasx_xvldi(68);
1336 __m256i spg_g = __lasx_xvldi(88);
1337 __m256i spr_g = __lasx_xvldi(98);
1338 __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311,
1339 0x2311231123112311};
1340 __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16,
1341 0x2D162D162D162D16};
1342 __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218,
1343 0x3218321832183218};
1344 __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100,
1345 0x1F0E1D0C1B0A1908};
1346
1347 for (x = 0; x < len; x++) {
1348 DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
1349 tmp0 = __lasx_xvpickev_b(src1, src0);
1350 tmp1 = __lasx_xvpickod_b(src1, src0);
1351 DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
1352 spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
1353 spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);
1354 spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
1355 spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
1356 spb = __lasx_xvsrli_h(spb, 7);
1357 spg = __lasx_xvsrli_h(spg, 7);
1358 spr = __lasx_xvsrli_h(spr, 7);
1359 spg = __lasx_xvsat_hu(spg, 7);
1360 spr = __lasx_xvsat_hu(spr, 7);
1361 reg0 = __lasx_xvpackev_b(spg, spb);
1362 reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);
1363 dst0 = __lasx_xvilvl_h(reg1, reg0);
1364 dst1 = __lasx_xvilvh_h(reg1, reg0);
1365 __lasx_xvst(dst0, dst_argb, 0);
1366 __lasx_xvst(dst1, dst_argb, 32);
1367 dst_argb += 64;
1368 }
1369 }
1370
ARGB4444ToARGBRow_LASX(const uint8_t * src_argb4444,uint8_t * dst_argb,int width)1371 void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
1372 uint8_t* dst_argb,
1373 int width) {
1374 int x;
1375 int len = width / 32;
1376 __m256i src0, src1;
1377 __m256i tmp0, tmp1, tmp2, tmp3;
1378 __m256i reg0, reg1, reg2, reg3;
1379 __m256i dst0, dst1, dst2, dst3;
1380
1381 for (x = 0; x < len; x++) {
1382 src0 = __lasx_xvld(src_argb4444, 0);
1383 src1 = __lasx_xvld(src_argb4444, 32);
1384 DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0,
1385 tmp0, tmp1, tmp2, tmp3);
1386 DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2);
1387 DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3);
1388 DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3,
1389 tmp0, tmp1, tmp2, tmp3);
1390 DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2);
1391 DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3);
1392 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2,
1393 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
1394 __lasx_xvst(dst0, dst_argb, 0);
1395 __lasx_xvst(dst1, dst_argb, 32);
1396 __lasx_xvst(dst2, dst_argb, 64);
1397 __lasx_xvst(dst3, dst_argb, 96);
1398 src_argb4444 += 64;
1399 dst_argb += 128;
1400 }
1401 }
1402
ARGB1555ToARGBRow_LASX(const uint8_t * src_argb1555,uint8_t * dst_argb,int width)1403 void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
1404 uint8_t* dst_argb,
1405 int width) {
1406 int x;
1407 int len = width / 32;
1408 __m256i src0, src1;
1409 __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
1410 __m256i reg0, reg1, reg2, reg3;
1411 __m256i dst0, dst1, dst2, dst3;
1412
1413 for (x = 0; x < len; x++) {
1414 src0 = __lasx_xvld(src_argb1555, 0);
1415 src1 = __lasx_xvld(src_argb1555, 32);
1416 tmp0 = __lasx_xvpickev_b(src1, src0);
1417 tmp1 = __lasx_xvpickod_b(src1, src0);
1418 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1419 tmpg = __lasx_xvsrli_b(tmp0, 5);
1420 reg0 = __lasx_xvandi_b(tmp1, 0x03);
1421 reg0 = __lasx_xvslli_b(reg0, 3);
1422 tmpg = __lasx_xvor_v(tmpg, reg0);
1423 reg1 = __lasx_xvandi_b(tmp1, 0x7C);
1424 tmpr = __lasx_xvsrli_b(reg1, 2);
1425 tmpa = __lasx_xvsrli_b(tmp1, 7);
1426 tmpa = __lasx_xvneg_b(tmpa);
1427 reg0 = __lasx_xvslli_b(tmpb, 3);
1428 reg1 = __lasx_xvslli_b(tmpg, 3);
1429 reg2 = __lasx_xvslli_b(tmpr, 3);
1430 tmpb = __lasx_xvsrli_b(tmpb, 2);
1431 tmpg = __lasx_xvsrli_b(tmpg, 2);
1432 tmpr = __lasx_xvsrli_b(tmpr, 2);
1433 tmpb = __lasx_xvor_v(reg0, tmpb);
1434 tmpg = __lasx_xvor_v(reg1, tmpg);
1435 tmpr = __lasx_xvor_v(reg2, tmpr);
1436 DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
1437 DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3);
1438 dst0 = __lasx_xvilvl_h(reg1, reg0);
1439 dst1 = __lasx_xvilvh_h(reg1, reg0);
1440 dst2 = __lasx_xvilvl_h(reg3, reg2);
1441 dst3 = __lasx_xvilvh_h(reg3, reg2);
1442 DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
1443 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
1444 __lasx_xvst(reg0, dst_argb, 0);
1445 __lasx_xvst(reg1, dst_argb, 32);
1446 __lasx_xvst(reg2, dst_argb, 64);
1447 __lasx_xvst(reg3, dst_argb, 96);
1448 src_argb1555 += 64;
1449 dst_argb += 128;
1450 }
1451 }
1452
RGB565ToARGBRow_LASX(const uint8_t * src_rgb565,uint8_t * dst_argb,int width)1453 void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
1454 uint8_t* dst_argb,
1455 int width) {
1456 int x;
1457 int len = width / 32;
1458 __m256i src0, src1;
1459 __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
1460 __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3;
1461 __m256i alpha = __lasx_xvldi(0xFF);
1462
1463 for (x = 0; x < len; x++) {
1464 src0 = __lasx_xvld(src_rgb565, 0);
1465 src1 = __lasx_xvld(src_rgb565, 32);
1466 tmp0 = __lasx_xvpickev_b(src1, src0);
1467 tmp1 = __lasx_xvpickod_b(src1, src0);
1468 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1469 tmpr = __lasx_xvandi_b(tmp1, 0xF8);
1470 reg1 = __lasx_xvandi_b(tmp1, 0x07);
1471 reg0 = __lasx_xvsrli_b(tmp0, 5);
1472 reg1 = __lasx_xvslli_b(reg1, 3);
1473 tmpg = __lasx_xvor_v(reg1, reg0);
1474 reg0 = __lasx_xvslli_b(tmpb, 3);
1475 reg1 = __lasx_xvsrli_b(tmpb, 2);
1476 tmpb = __lasx_xvor_v(reg1, reg0);
1477 reg0 = __lasx_xvslli_b(tmpg, 2);
1478 reg1 = __lasx_xvsrli_b(tmpg, 4);
1479 tmpg = __lasx_xvor_v(reg1, reg0);
1480 reg0 = __lasx_xvsrli_b(tmpr, 5);
1481 tmpr = __lasx_xvor_v(tmpr, reg0);
1482 DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1483 dst0 = __lasx_xvilvl_h(reg1, reg0);
1484 dst1 = __lasx_xvilvh_h(reg1, reg0);
1485 DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1486 dst2 = __lasx_xvilvl_h(reg1, reg0);
1487 dst3 = __lasx_xvilvh_h(reg1, reg0);
1488 DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
1489 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
1490 __lasx_xvst(reg0, dst_argb, 0);
1491 __lasx_xvst(reg1, dst_argb, 32);
1492 __lasx_xvst(reg2, dst_argb, 64);
1493 __lasx_xvst(reg3, dst_argb, 96);
1494 src_rgb565 += 64;
1495 dst_argb += 128;
1496 }
1497 }
1498
RGB24ToARGBRow_LASX(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)1499 void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
1500 uint8_t* dst_argb,
1501 int width) {
1502 int x;
1503 int len = width / 32;
1504 __m256i src0, src1, src2;
1505 __m256i tmp0, tmp1, tmp2;
1506 __m256i dst0, dst1, dst2, dst3;
1507 __m256i reg0, reg1, reg2, reg3;
1508 __m256i alpha = __lasx_xvldi(0xFF);
1509 __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
1510 0x1B1A191817161514};
1511 __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
1512 0x0706050403020100};
1513 __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
1514 0x131211100F0E0D0C};
1515 __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,
1516 0x100B0A0910080706};
1517
1518 for (x = 0; x < len; x++) {
1519 reg0 = __lasx_xvld(src_rgb24, 0);
1520 reg1 = __lasx_xvld(src_rgb24, 32);
1521 reg2 = __lasx_xvld(src_rgb24, 64);
1522 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1523 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1524 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1525 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
1526 tmp1);
1527 tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
1528 DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1529 tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
1530 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
1531 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
1532 __lasx_xvst(dst0, dst_argb, 0);
1533 __lasx_xvst(dst1, dst_argb, 32);
1534 __lasx_xvst(dst2, dst_argb, 64);
1535 __lasx_xvst(dst3, dst_argb, 96);
1536 src_rgb24 += 96;
1537 dst_argb += 128;
1538 }
1539 }
1540
RAWToARGBRow_LASX(const uint8_t * src_raw,uint8_t * dst_argb,int width)1541 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1542 int x;
1543 int len = width / 32;
1544 __m256i src0, src1, src2;
1545 __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
1546 __m256i dst0, dst1, dst2, dst3;
1547 __m256i alpha = __lasx_xvldi(0xFF);
1548 __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
1549 0x1B1A191817161514};
1550 __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
1551 0x0706050403020100};
1552 __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
1553 0x131211100F0E0D0C};
1554 __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,
1555 0x10090A0B10060708};
1556
1557 for (x = 0; x < len; x++) {
1558 reg0 = __lasx_xvld(src_raw, 0);
1559 reg1 = __lasx_xvld(src_raw, 32);
1560 reg2 = __lasx_xvld(src_raw, 64);
1561 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1562 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1563 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1564 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
1565 tmp1);
1566 tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
1567 DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1568 tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
1569 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
1570 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
1571 __lasx_xvst(dst0, dst_argb, 0);
1572 __lasx_xvst(dst1, dst_argb, 32);
1573 __lasx_xvst(dst2, dst_argb, 64);
1574 __lasx_xvst(dst3, dst_argb, 96);
1575 src_raw += 96;
1576 dst_argb += 128;
1577 }
1578 }
1579
ARGB1555ToYRow_LASX(const uint8_t * src_argb1555,uint8_t * dst_y,int width)1580 void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
1581 uint8_t* dst_y,
1582 int width) {
1583 int x;
1584 int len = width / 32;
1585 __m256i src0, src1;
1586 __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
1587 __m256i reg0, reg1, reg2, dst0;
1588 __m256i const_66 = __lasx_xvldi(66);
1589 __m256i const_129 = __lasx_xvldi(129);
1590 __m256i const_25 = __lasx_xvldi(25);
1591 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1592 0x1080108010801080, 0x1080108010801080};
1593
1594 for (x = 0; x < len; x++) {
1595 src0 = __lasx_xvld(src_argb1555, 0);
1596 src1 = __lasx_xvld(src_argb1555, 32);
1597 tmp0 = __lasx_xvpickev_b(src1, src0);
1598 tmp1 = __lasx_xvpickod_b(src1, src0);
1599 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1600 tmpg = __lasx_xvsrli_b(tmp0, 5);
1601 reg0 = __lasx_xvandi_b(tmp1, 0x03);
1602 reg0 = __lasx_xvslli_b(reg0, 3);
1603 tmpg = __lasx_xvor_v(tmpg, reg0);
1604 reg1 = __lasx_xvandi_b(tmp1, 0x7C);
1605 tmpr = __lasx_xvsrli_b(reg1, 2);
1606 reg0 = __lasx_xvslli_b(tmpb, 3);
1607 reg1 = __lasx_xvslli_b(tmpg, 3);
1608 reg2 = __lasx_xvslli_b(tmpr, 3);
1609 tmpb = __lasx_xvsrli_b(tmpb, 2);
1610 tmpg = __lasx_xvsrli_b(tmpg, 2);
1611 tmpr = __lasx_xvsrli_b(tmpr, 2);
1612 tmpb = __lasx_xvor_v(reg0, tmpb);
1613 tmpg = __lasx_xvor_v(reg1, tmpg);
1614 tmpr = __lasx_xvor_v(reg2, tmpr);
1615 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
1616 reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
1617 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
1618 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
1619 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
1620 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
1621 dst0 = __lasx_xvpackod_b(reg1, reg0);
1622 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1623 __lasx_xvst(dst0, dst_y, 0);
1624 src_argb1555 += 64;
1625 dst_y += 32;
1626 }
1627 }
1628
ARGB1555ToUVRow_LASX(const uint8_t * src_argb1555,int src_stride_argb1555,uint8_t * dst_u,uint8_t * dst_v,int width)1629 void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
1630 int src_stride_argb1555,
1631 uint8_t* dst_u,
1632 uint8_t* dst_v,
1633 int width) {
1634 int x;
1635 int len = width / 32;
1636 const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
1637 __m256i src0, src1, src2, src3;
1638 __m256i tmp0, tmp1, tmp2, tmp3;
1639 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1640 __m256i reg0, reg1, reg2, reg3, dst0;
1641 __m256i const_112 = __lasx_xvldi(0x438);
1642 __m256i const_74 = __lasx_xvldi(0x425);
1643 __m256i const_38 = __lasx_xvldi(0x413);
1644 __m256i const_94 = __lasx_xvldi(0x42F);
1645 __m256i const_18 = __lasx_xvldi(0x409);
1646 __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
1647 0x8080808080808080, 0x8080808080808080};
1648
1649 for (x = 0; x < len; x++) {
1650 DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
1651 next_argb1555, 32, src0, src1, src2, src3);
1652 DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1653 DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1654 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1655 nexb = __lasx_xvandi_b(tmp2, 0x1F);
1656 tmpg = __lasx_xvsrli_b(tmp0, 5);
1657 nexg = __lasx_xvsrli_b(tmp2, 5);
1658 reg0 = __lasx_xvandi_b(tmp1, 0x03);
1659 reg2 = __lasx_xvandi_b(tmp3, 0x03);
1660 reg0 = __lasx_xvslli_b(reg0, 3);
1661 reg2 = __lasx_xvslli_b(reg2, 3);
1662 tmpg = __lasx_xvor_v(tmpg, reg0);
1663 nexg = __lasx_xvor_v(nexg, reg2);
1664 reg1 = __lasx_xvandi_b(tmp1, 0x7C);
1665 reg3 = __lasx_xvandi_b(tmp3, 0x7C);
1666 tmpr = __lasx_xvsrli_b(reg1, 2);
1667 nexr = __lasx_xvsrli_b(reg3, 2);
1668 reg0 = __lasx_xvslli_b(tmpb, 3);
1669 reg1 = __lasx_xvslli_b(tmpg, 3);
1670 reg2 = __lasx_xvslli_b(tmpr, 3);
1671 tmpb = __lasx_xvsrli_b(tmpb, 2);
1672 tmpg = __lasx_xvsrli_b(tmpg, 2);
1673 tmpr = __lasx_xvsrli_b(tmpr, 2);
1674 tmpb = __lasx_xvor_v(reg0, tmpb);
1675 tmpg = __lasx_xvor_v(reg1, tmpg);
1676 tmpr = __lasx_xvor_v(reg2, tmpr);
1677 reg0 = __lasx_xvslli_b(nexb, 3);
1678 reg1 = __lasx_xvslli_b(nexg, 3);
1679 reg2 = __lasx_xvslli_b(nexr, 3);
1680 nexb = __lasx_xvsrli_b(nexb, 2);
1681 nexg = __lasx_xvsrli_b(nexg, 2);
1682 nexr = __lasx_xvsrli_b(nexr, 2);
1683 nexb = __lasx_xvor_v(reg0, nexb);
1684 nexg = __lasx_xvor_v(reg1, nexg);
1685 nexr = __lasx_xvor_v(reg2, nexr);
1686 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1687 reg0 = __lasx_xvpermi_d(reg0, 0xD8);
1688 reg1 = __lasx_xvpermi_d(reg1, 0xD8);
1689 dst0 = __lasx_xvpickod_b(reg1, reg0);
1690 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1691 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1692 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1693 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1694 src_argb1555 += 64;
1695 next_argb1555 += 64;
1696 dst_u += 16;
1697 dst_v += 16;
1698 }
1699 }
1700
RGB565ToYRow_LASX(const uint8_t * src_rgb565,uint8_t * dst_y,int width)1701 void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1702 int x;
1703 int len = width / 32;
1704 __m256i src0, src1;
1705 __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
1706 __m256i reg0, reg1, dst0;
1707 __m256i const_66 = __lasx_xvldi(66);
1708 __m256i const_129 = __lasx_xvldi(129);
1709 __m256i const_25 = __lasx_xvldi(25);
1710 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1711 0x1080108010801080, 0x1080108010801080};
1712
1713 for (x = 0; x < len; x++) {
1714 src0 = __lasx_xvld(src_rgb565, 0);
1715 src1 = __lasx_xvld(src_rgb565, 32);
1716 tmp0 = __lasx_xvpickev_b(src1, src0);
1717 tmp1 = __lasx_xvpickod_b(src1, src0);
1718 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1719 tmpr = __lasx_xvandi_b(tmp1, 0xF8);
1720 reg1 = __lasx_xvandi_b(tmp1, 0x07);
1721 reg0 = __lasx_xvsrli_b(tmp0, 5);
1722 reg1 = __lasx_xvslli_b(reg1, 3);
1723 tmpg = __lasx_xvor_v(reg1, reg0);
1724 reg0 = __lasx_xvslli_b(tmpb, 3);
1725 reg1 = __lasx_xvsrli_b(tmpb, 2);
1726 tmpb = __lasx_xvor_v(reg1, reg0);
1727 reg0 = __lasx_xvslli_b(tmpg, 2);
1728 reg1 = __lasx_xvsrli_b(tmpg, 4);
1729 tmpg = __lasx_xvor_v(reg1, reg0);
1730 reg0 = __lasx_xvsrli_b(tmpr, 5);
1731 tmpr = __lasx_xvor_v(tmpr, reg0);
1732 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
1733 reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
1734 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
1735 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
1736 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
1737 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
1738 dst0 = __lasx_xvpackod_b(reg1, reg0);
1739 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1740 __lasx_xvst(dst0, dst_y, 0);
1741 dst_y += 32;
1742 src_rgb565 += 64;
1743 }
1744 }
1745
RGB565ToUVRow_LASX(const uint8_t * src_rgb565,int src_stride_rgb565,uint8_t * dst_u,uint8_t * dst_v,int width)1746 void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
1747 int src_stride_rgb565,
1748 uint8_t* dst_u,
1749 uint8_t* dst_v,
1750 int width) {
1751 int x;
1752 int len = width / 32;
1753 const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
1754 __m256i src0, src1, src2, src3;
1755 __m256i tmp0, tmp1, tmp2, tmp3;
1756 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1757 __m256i reg0, reg1, reg2, reg3, dst0;
1758 __m256i const_112 = __lasx_xvldi(0x438);
1759 __m256i const_74 = __lasx_xvldi(0x425);
1760 __m256i const_38 = __lasx_xvldi(0x413);
1761 __m256i const_94 = __lasx_xvldi(0x42F);
1762 __m256i const_18 = __lasx_xvldi(0x409);
1763 __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
1764 0x8080808080808080, 0x8080808080808080};
1765
1766 for (x = 0; x < len; x++) {
1767 DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
1768 next_rgb565, 32, src0, src1, src2, src3);
1769 DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1770 DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1771 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1772 tmpr = __lasx_xvandi_b(tmp1, 0xF8);
1773 nexb = __lasx_xvandi_b(tmp2, 0x1F);
1774 nexr = __lasx_xvandi_b(tmp3, 0xF8);
1775 reg1 = __lasx_xvandi_b(tmp1, 0x07);
1776 reg3 = __lasx_xvandi_b(tmp3, 0x07);
1777 reg0 = __lasx_xvsrli_b(tmp0, 5);
1778 reg1 = __lasx_xvslli_b(reg1, 3);
1779 reg2 = __lasx_xvsrli_b(tmp2, 5);
1780 reg3 = __lasx_xvslli_b(reg3, 3);
1781 tmpg = __lasx_xvor_v(reg1, reg0);
1782 nexg = __lasx_xvor_v(reg2, reg3);
1783 reg0 = __lasx_xvslli_b(tmpb, 3);
1784 reg1 = __lasx_xvsrli_b(tmpb, 2);
1785 reg2 = __lasx_xvslli_b(nexb, 3);
1786 reg3 = __lasx_xvsrli_b(nexb, 2);
1787 tmpb = __lasx_xvor_v(reg1, reg0);
1788 nexb = __lasx_xvor_v(reg2, reg3);
1789 reg0 = __lasx_xvslli_b(tmpg, 2);
1790 reg1 = __lasx_xvsrli_b(tmpg, 4);
1791 reg2 = __lasx_xvslli_b(nexg, 2);
1792 reg3 = __lasx_xvsrli_b(nexg, 4);
1793 tmpg = __lasx_xvor_v(reg1, reg0);
1794 nexg = __lasx_xvor_v(reg2, reg3);
1795 reg0 = __lasx_xvsrli_b(tmpr, 5);
1796 reg2 = __lasx_xvsrli_b(nexr, 5);
1797 tmpr = __lasx_xvor_v(tmpr, reg0);
1798 nexr = __lasx_xvor_v(nexr, reg2);
1799 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1800 reg0 = __lasx_xvpermi_d(reg0, 0xD8);
1801 reg1 = __lasx_xvpermi_d(reg1, 0xD8);
1802 dst0 = __lasx_xvpickod_b(reg1, reg0);
1803 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1804 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1805 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1806 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1807 dst_u += 16;
1808 dst_v += 16;
1809 src_rgb565 += 64;
1810 next_rgb565 += 64;
1811 }
1812 }
1813
RGB24ToYRow_LASX(const uint8_t * src_rgb24,uint8_t * dst_y,int width)1814 void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
1815 int x;
1816 int len = width / 32;
1817 __m256i src0, src1, src2;
1818 __m256i tmp0, tmp1, tmp2, tmp3;
1819 __m256i reg0, reg1, reg2, dst0;
1820 __m256i const_129 = __lasx_xvldi(129);
1821 __m256i const_br = {0x4219421942194219, 0x4219421942194219,
1822 0x4219421942194219, 0x4219421942194219};
1823 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1824 0x1080108010801080, 0x1080108010801080};
1825 __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200,
1826 0x17151412110F0E0C};
1827 __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18,
1828 0x0F0D0C0A09070604};
1829 __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001,
1830 0x001600130010000D};
1831 __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019,
1832 0x000E000B00080005};
1833
1834 for (x = 0; x < len; x++) {
1835 reg0 = __lasx_xvld(src_rgb24, 0);
1836 reg1 = __lasx_xvld(src_rgb24, 32);
1837 reg2 = __lasx_xvld(src_rgb24, 64);
1838 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1839 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1840 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1841 tmp0 = __lasx_xvshuf_b(src1, src0, shuff0);
1842 tmp1 = __lasx_xvshuf_b(src1, src2, shuff1);
1843 tmp2 = __lasx_xvshuf_b(src1, src0, shuff2);
1844 tmp3 = __lasx_xvshuf_b(src1, src2, shuff3);
1845 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129);
1846 reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129);
1847 reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0);
1848 reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1);
1849 dst0 = __lasx_xvpickod_b(reg1, reg0);
1850 __lasx_xvst(dst0, dst_y, 0);
1851 dst_y += 32;
1852 src_rgb24 += 96;
1853 }
1854 }
1855
RGB24ToUVRow_LASX(const uint8_t * src_rgb24,int src_stride_rgb24,uint8_t * dst_u,uint8_t * dst_v,int width)1856 void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
1857 int src_stride_rgb24,
1858 uint8_t* dst_u,
1859 uint8_t* dst_v,
1860 int width) {
1861 int x;
1862 const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
1863 int len = width / 32;
1864 __m256i src0, src1, src2, reg0, reg1, reg2;
1865 __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
1866 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1867 __m256i const_112 = __lasx_xvldi(0x438);
1868 __m256i const_74 = __lasx_xvldi(0x425);
1869 __m256i const_38 = __lasx_xvldi(0x413);
1870 __m256i const_94 = __lasx_xvldi(0x42F);
1871 __m256i const_18 = __lasx_xvldi(0x409);
1872 __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
1873 0x8080808080808080, 0x8080808080808080};
1874 __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
1875 0x15120F0C09060300, 0x00000000001E1B18};
1876 __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
1877 0x0706050403020100, 0x1D1A1714110A0908};
1878 __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
1879 0x1613100D0A070401, 0x00000000001F1C19};
1880 __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
1881 0x0706050403020100, 0x1E1B1815120A0908};
1882 __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A,
1883 0x1714110E0B080502, 0x0000000000001D1A};
1884 __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908,
1885 0x0706050403020100, 0x1F1C191613100908};
1886
1887 for (x = 0; x < len; x++) {
1888 DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
1889 next_rgb24, 0, reg0, reg1, reg2, tmp0);
1890 DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
1891 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
1892 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
1893 DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
1894 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
1895 nexb);
1896 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
1897 nexg);
1898 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
1899 nexr);
1900 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
1901 nexb);
1902 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
1903 nexg);
1904 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
1905 nexr);
1906 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1907 dst0 = __lasx_xvpickod_b(reg1, reg0);
1908 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1909 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1910 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1911 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1912 src_rgb24 += 96;
1913 next_rgb24 += 96;
1914 dst_u += 16;
1915 dst_v += 16;
1916 }
1917 }
1918
RAWToYRow_LASX(const uint8_t * src_raw,uint8_t * dst_y,int width)1919 void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
1920 int x;
1921 int len = width / 32;
1922 __m256i src0, src1, src2;
1923 __m256i tmp0, tmp1, tmp2, tmp3;
1924 __m256i reg0, reg1, reg2, dst0;
1925 __m256i const_129 = __lasx_xvldi(129);
1926 __m256i const_br = {0x1942194219421942, 0x1942194219421942,
1927 0x1942194219421942, 0x1942194219421942};
1928 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1929 0x1080108010801080, 0x1080108010801080};
1930 __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200,
1931 0x17151412110F0E0C};
1932 __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18,
1933 0x0F0D0C0A09070604};
1934 __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001,
1935 0x001600130010000D};
1936 __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019,
1937 0x000E000B00080005};
1938
1939 for (x = 0; x < len; x++) {
1940 reg0 = __lasx_xvld(src_raw, 0);
1941 reg1 = __lasx_xvld(src_raw, 32);
1942 reg2 = __lasx_xvld(src_raw, 64);
1943 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1944 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1945 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1946 tmp0 = __lasx_xvshuf_b(src1, src0, shuff0);
1947 tmp1 = __lasx_xvshuf_b(src1, src2, shuff1);
1948 tmp2 = __lasx_xvshuf_b(src1, src0, shuff2);
1949 tmp3 = __lasx_xvshuf_b(src1, src2, shuff3);
1950 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129);
1951 reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129);
1952 reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0);
1953 reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1);
1954 dst0 = __lasx_xvpickod_b(reg1, reg0);
1955 __lasx_xvst(dst0, dst_y, 0);
1956 dst_y += 32;
1957 src_raw += 96;
1958 }
1959 }
1960
RAWToUVRow_LASX(const uint8_t * src_raw,int src_stride_raw,uint8_t * dst_u,uint8_t * dst_v,int width)1961 void RAWToUVRow_LASX(const uint8_t* src_raw,
1962 int src_stride_raw,
1963 uint8_t* dst_u,
1964 uint8_t* dst_v,
1965 int width) {
1966 int x;
1967 const uint8_t* next_raw = src_raw + src_stride_raw;
1968 int len = width / 32;
1969 __m256i src0, src1, src2, reg0, reg1, reg2;
1970 __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
1971 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1972 __m256i const_112 = __lasx_xvldi(0x438);
1973 __m256i const_74 = __lasx_xvldi(0x425);
1974 __m256i const_38 = __lasx_xvldi(0x413);
1975 __m256i const_94 = __lasx_xvldi(0x42F);
1976 __m256i const_18 = __lasx_xvldi(0x409);
1977 __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
1978 0x8080808080808080, 0x8080808080808080};
1979 __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
1980 0x15120F0C09060300, 0x00000000001E1B18};
1981 __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
1982 0x0706050403020100, 0x1D1A1714110A0908};
1983 __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
1984 0x1613100D0A070401, 0x00000000001F1C19};
1985 __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
1986 0x0706050403020100, 0x1E1B1815120A0908};
1987 __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A,
1988 0x1714110E0B080502, 0x0000000000001D1A};
1989 __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908,
1990 0x0706050403020100, 0x1F1C191613100908};
1991
1992 for (x = 0; x < len; x++) {
1993 DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
1994 reg0, reg1, reg2, tmp0);
1995 DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
1996 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
1997 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
1998 DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
1999 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
2000 nexb);
2001 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
2002 nexg);
2003 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
2004 nexr);
2005 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
2006 nexb);
2007 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
2008 nexg);
2009 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
2010 nexr);
2011 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
2012 dst0 = __lasx_xvpickod_b(reg1, reg0);
2013 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
2014 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
2015 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
2016 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
2017 src_raw += 96;
2018 next_raw += 96;
2019 dst_u += 16;
2020 dst_v += 16;
2021 }
2022 }
2023
NV12ToARGBRow_LASX(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)2024 void NV12ToARGBRow_LASX(const uint8_t* src_y,
2025 const uint8_t* src_uv,
2026 uint8_t* dst_argb,
2027 const struct YuvConstants* yuvconstants,
2028 int width) {
2029 int x;
2030 int len = width / 16;
2031 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
2032 __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
2033 __m256i out_b, out_g, out_r;
2034 __m256i const_0x80 = __lasx_xvldi(0x80);
2035 __m256i alpha = __lasx_xvldi(0xFF);
2036
2037 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2038 vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
2039 vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
2040
2041 for (x = 0; x < len; x++) {
2042 vec_y = __lasx_xvld(src_y, 0);
2043 vec_vu = __lasx_xvld(src_uv, 0);
2044 vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
2045 vec_vu = __lasx_vext2xv_h_b(vec_vu);
2046 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
2047 out_b);
2048 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2049 src_y += 16;
2050 src_uv += 16;
2051 }
2052 }
2053
NV12ToRGB565Row_LASX(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)2054 void NV12ToRGB565Row_LASX(const uint8_t* src_y,
2055 const uint8_t* src_uv,
2056 uint8_t* dst_rgb565,
2057 const struct YuvConstants* yuvconstants,
2058 int width) {
2059 int x;
2060 int len = width / 16;
2061 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
2062 __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
2063 __m256i out_b, out_g, out_r;
2064 __m256i const_0x80 = __lasx_xvldi(0x80);
2065
2066 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2067 vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
2068 vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
2069
2070 for (x = 0; x < len; x++) {
2071 vec_y = __lasx_xvld(src_y, 0);
2072 vec_vu = __lasx_xvld(src_uv, 0);
2073 vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
2074 vec_vu = __lasx_vext2xv_h_b(vec_vu);
2075 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
2076 out_b);
2077 out_b = __lasx_xvsrli_h(out_b, 3);
2078 out_g = __lasx_xvsrli_h(out_g, 2);
2079 out_r = __lasx_xvsrli_h(out_r, 3);
2080 out_g = __lasx_xvslli_h(out_g, 5);
2081 out_r = __lasx_xvslli_h(out_r, 11);
2082 out_r = __lasx_xvor_v(out_r, out_g);
2083 out_r = __lasx_xvor_v(out_r, out_b);
2084 __lasx_xvst(out_r, dst_rgb565, 0);
2085 src_y += 16;
2086 src_uv += 16;
2087 dst_rgb565 += 32;
2088 }
2089 }
2090
NV21ToARGBRow_LASX(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)2091 void NV21ToARGBRow_LASX(const uint8_t* src_y,
2092 const uint8_t* src_uv,
2093 uint8_t* dst_argb,
2094 const struct YuvConstants* yuvconstants,
2095 int width) {
2096 int x;
2097 int len = width / 16;
2098 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
2099 __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv;
2100 __m256i out_b, out_g, out_r;
2101 __m256i const_0x80 = __lasx_xvldi(0x80);
2102 __m256i alpha = __lasx_xvldi(0xFF);
2103
2104 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2105 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
2106 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
2107
2108 for (x = 0; x < len; x++) {
2109 vec_y = __lasx_xvld(src_y, 0);
2110 vec_uv = __lasx_xvld(src_uv, 0);
2111 vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
2112 vec_uv = __lasx_vext2xv_h_b(vec_uv);
2113 YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
2114 out_r);
2115 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2116 src_y += 16;
2117 src_uv += 16;
2118 }
2119 }
2120
ARGBToYJRow_LASX(const uint8_t * src_argb,uint8_t * dst_y,int width)2121 void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2122 int x;
2123 int len = width / 32;
2124 __m256i src0, src1, src2, src3, dst0;
2125 __m256i tmp0, tmp1, tmp2, tmp3;
2126 __m256i reg0, reg1;
2127 __m256i const_128 = __lasx_xvldi(0x480);
2128 __m256i const_150 = __lasx_xvldi(0x96);
2129 __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
2130 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
2131 __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
2132 0x0000000700000003};
2133
2134 for (x = 0; x < len; x++) {
2135 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
2136 96, src0, src1, src2, src3);
2137 tmp0 = __lasx_xvpickev_b(src1, src0);
2138 tmp1 = __lasx_xvpickod_b(src1, src0);
2139 tmp2 = __lasx_xvpickev_b(src3, src2);
2140 tmp3 = __lasx_xvpickod_b(src3, src2);
2141 reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);
2142 reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150);
2143 reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0);
2144 reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2);
2145 dst0 = __lasx_xvpickod_b(reg1, reg0);
2146 dst0 = __lasx_xvperm_w(dst0, shuff);
2147 __lasx_xvst(dst0, dst_y, 0);
2148 dst_y += 32;
2149 src_argb += 128;
2150 }
2151 }
2152
ARGBToUVJRow_LASX(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2153 void ARGBToUVJRow_LASX(const uint8_t* src_argb,
2154 int src_stride_argb,
2155 uint8_t* dst_u,
2156 uint8_t* dst_v,
2157 int width) {
2158 int x;
2159 const uint8_t* next_argb = src_argb + src_stride_argb;
2160 int len = width / 32;
2161 __m256i src0, src1, src2, src3;
2162 __m256i nex0, nex1, nex2, nex3;
2163 __m256i tmp0, tmp1, tmp2, tmp3;
2164 __m256i reg0, reg1, dst0;
2165 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
2166 __m256i const_63 = __lasx_xvldi(0x43F);
2167 __m256i const_42 = __lasx_xvldi(0x42A);
2168 __m256i const_21 = __lasx_xvldi(0x415);
2169 __m256i const_53 = __lasx_xvldi(0x435);
2170 __m256i const_10 = __lasx_xvldi(0x40A);
2171 __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
2172 0x8080808080808080, 0x8080808080808080};
2173 __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
2174 0x1F1D0F0D1B190B09};
2175
2176 for (x = 0; x < len; x++) {
2177 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
2178 96, src0, src1, src2, src3);
2179 DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64,
2180 next_argb, 96, nex0, nex1, nex2, nex3);
2181 tmp0 = __lasx_xvpickev_b(src1, src0);
2182 tmp1 = __lasx_xvpickod_b(src1, src0);
2183 tmp2 = __lasx_xvpickev_b(src3, src2);
2184 tmp3 = __lasx_xvpickod_b(src3, src2);
2185 tmpr = __lasx_xvpickod_b(tmp2, tmp0);
2186 tmpb = __lasx_xvpickev_b(tmp2, tmp0);
2187 tmpg = __lasx_xvpickev_b(tmp3, tmp1);
2188 tmp0 = __lasx_xvpickev_b(nex1, nex0);
2189 tmp1 = __lasx_xvpickod_b(nex1, nex0);
2190 tmp2 = __lasx_xvpickev_b(nex3, nex2);
2191 tmp3 = __lasx_xvpickod_b(nex3, nex2);
2192 nexr = __lasx_xvpickod_b(tmp2, tmp0);
2193 nexb = __lasx_xvpickev_b(tmp2, tmp0);
2194 nexg = __lasx_xvpickev_b(tmp3, tmp1);
2195 tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb);
2196 tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb);
2197 tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg);
2198 tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
2199 reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
2200 reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
2201 tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
2202 tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
2203 tmpr = __lasx_xvavgr_hu(reg0, reg1);
2204 reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
2205 reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
2206 reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
2207 reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
2208 reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
2209 reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
2210 dst0 = __lasx_xvpackod_b(reg1, reg0);
2211 tmp0 = __lasx_xvpermi_d(dst0, 0x44);
2212 tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
2213 dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff);
2214 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
2215 __lasx_xvstelm_d(dst0, dst_v, 0, 2);
2216 __lasx_xvstelm_d(dst0, dst_u, 8, 1);
2217 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
2218 dst_u += 16;
2219 dst_v += 16;
2220 src_argb += 128;
2221 next_argb += 128;
2222 }
2223 }
2224
2225 #ifdef __cplusplus
2226 } // extern "C"
2227 } // namespace libyuv
2228 #endif
2229
2230 #endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
2231