1 /*
2 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
3 *
4 * Copyright (c) 2022 Loongson Technology Corporation Limited
5 *
6 * Use of this source code is governed by a BSD-style license
7 * that can be found in the LICENSE file in the root of the source
8 * tree. An additional intellectual property rights grant can be found
9 * in the file PATENTS. All contributing project authors may
10 * be found in the AUTHORS file in the root of the source tree.
11 */
12
13 #include "libyuv/row.h"
14
15 #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
16 #include "libyuv/loongson_intrinsics.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
23 // Fill YUV -> RGB conversion constants into vectors
24 #define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
25 { \
26 ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \
27 vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \
28 ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \
29 vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \
30 yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \
31 yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
32 }
33
34 // Convert 8 pixels of YUV420 to RGB.
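// Y is scaled by yg (>>16) and biased by yb; VU is zero-extended, centered
// by subtracting 0x80 and weighted by the ub/vr/ug/vg coefficients; the
// sums are shifted right by 6 and clipped to [0, 255].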
35 #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
36 { \
37 __m128i y_ev, y_od, u_l, v_l; \
38 __m128i tmp0, tmp1, tmp2, tmp3; \
39 \
40 tmp0 = __lsx_vilvl_b(in_y, in_y); \
41 y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \
42 y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \
43 y_ev = __lsx_vsrai_w(y_ev, 16); \
44 y_od = __lsx_vsrai_w(y_od, 16); \
45 y_ev = __lsx_vadd_w(y_ev, yb); \
46 y_od = __lsx_vadd_w(y_od, yb); \
47 in_vu = __lsx_vilvl_b(zero, in_vu); \
48 in_vu = __lsx_vsub_h(in_vu, const_80); \
49 u_l = __lsx_vmulwev_w_h(in_vu, vrub); \
50 v_l = __lsx_vmulwod_w_h(in_vu, vrub); \
51 tmp0 = __lsx_vadd_w(y_ev, u_l); \
52 tmp1 = __lsx_vadd_w(y_od, u_l); \
53 tmp2 = __lsx_vadd_w(y_ev, v_l); \
54 tmp3 = __lsx_vadd_w(y_od, v_l); \
55 tmp0 = __lsx_vsrai_w(tmp0, 6); \
56 tmp1 = __lsx_vsrai_w(tmp1, 6); \
57 tmp2 = __lsx_vsrai_w(tmp2, 6); \
58 tmp3 = __lsx_vsrai_w(tmp3, 6); \
59 tmp0 = __lsx_vclip255_w(tmp0); \
60 tmp1 = __lsx_vclip255_w(tmp1); \
61 tmp2 = __lsx_vclip255_w(tmp2); \
62 tmp3 = __lsx_vclip255_w(tmp3); \
63 out_b = __lsx_vpackev_h(tmp1, tmp0); \
64 out_r = __lsx_vpackev_h(tmp3, tmp2); \
65 tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \
66 tmp1 = __lsx_vsub_w(y_ev, tmp0); \
67 tmp2 = __lsx_vsub_w(y_od, tmp0); \
68 tmp1 = __lsx_vsrai_w(tmp1, 6); \
69 tmp2 = __lsx_vsrai_w(tmp2, 6); \
70 tmp1 = __lsx_vclip255_w(tmp1); \
71 tmp2 = __lsx_vclip255_w(tmp2); \
72 out_g = __lsx_vpackev_h(tmp2, tmp1); \
73 }
74
75 // Convert 8 pixels of I444 to RGB.
76 #define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
77 out_r) \
78 { \
79 __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \
80 __m128i tmp0, tmp1, tmp2, tmp3; \
81 \
82 y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \
83 y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \
84 y_ev = __lsx_vsrai_w(y_ev, 16); \
85 y_od = __lsx_vsrai_w(y_od, 16); \
86 y_ev = __lsx_vadd_w(y_ev, yb); \
87 y_od = __lsx_vadd_w(y_od, yb); \
88 in_u = __lsx_vsub_h(in_u, const_80); \
89 in_v = __lsx_vsub_h(in_v, const_80); \
90 u_ev = __lsx_vmulwev_w_h(in_u, ub); \
91 u_od = __lsx_vmulwod_w_h(in_u, ub); \
92 v_ev = __lsx_vmulwev_w_h(in_v, vr); \
93 v_od = __lsx_vmulwod_w_h(in_v, vr); \
94 tmp0 = __lsx_vadd_w(y_ev, u_ev); \
95 tmp1 = __lsx_vadd_w(y_od, u_od); \
96 tmp2 = __lsx_vadd_w(y_ev, v_ev); \
97 tmp3 = __lsx_vadd_w(y_od, v_od); \
98 tmp0 = __lsx_vsrai_w(tmp0, 6); \
99 tmp1 = __lsx_vsrai_w(tmp1, 6); \
100 tmp2 = __lsx_vsrai_w(tmp2, 6); \
101 tmp3 = __lsx_vsrai_w(tmp3, 6); \
102 tmp0 = __lsx_vclip255_w(tmp0); \
103 tmp1 = __lsx_vclip255_w(tmp1); \
104 tmp2 = __lsx_vclip255_w(tmp2); \
105 tmp3 = __lsx_vclip255_w(tmp3); \
106 out_b = __lsx_vpackev_h(tmp1, tmp0); \
107 out_r = __lsx_vpackev_h(tmp3, tmp2); \
108 u_ev = __lsx_vpackev_h(in_u, in_v); \
109 u_od = __lsx_vpackod_h(in_u, in_v); \
110 v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \
111 v_od = __lsx_vdp2_w_h(u_od, ugvg); \
112 tmp0 = __lsx_vsub_w(y_ev, v_ev); \
113 tmp1 = __lsx_vsub_w(y_od, v_od); \
114 tmp0 = __lsx_vsrai_w(tmp0, 6); \
115 tmp1 = __lsx_vsrai_w(tmp1, 6); \
116 tmp0 = __lsx_vclip255_w(tmp0); \
117 tmp1 = __lsx_vclip255_w(tmp1); \
118 out_g = __lsx_vpackev_h(tmp1, tmp0); \
119 }
120
121 // Pack and Store 8 ARGB values.
122 #define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
123 { \
124 __m128i temp0, temp1; \
125 __m128i dst0, dst1; \
126 \
127 temp0 = __lsx_vpackev_b(in_g, in_b); \
128 temp1 = __lsx_vpackev_b(in_a, in_r); \
129 dst0 = __lsx_vilvl_h(temp1, temp0); \
130 dst1 = __lsx_vilvh_h(temp1, temp0); \
131 __lsx_vst(dst0, pdst_argb, 0); \
132 __lsx_vst(dst1, pdst_argb, 16); \
133 pdst_argb += 32; \
134 }
135
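// Subsample 2x2 blocks of B/G/R from the current and next row and compute
// 8 U values (low half of _dst0) and 8 V values (high half) using BT.601
// coefficients and a 0x8080 rounding bias.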
136 #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
137 { \
138 __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
139 __m128i _reg0, _reg1; \
140 _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
141 _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
142 _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \
143 _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
144 _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
145 _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
146 _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
147 _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
148 _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
149 _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
150 _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
151 _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
152 _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
153 _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
154 _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \
155 _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
156 }
157
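// Convert 16 ARGB4444 pixels per loop to ARGB by replicating each 4-bit
// channel into the high and low nibbles.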
158 void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
159 uint8_t* dst_argb,
160 int width) {
161 int x;
162 int len = width / 16;
163 __m128i src0, src1;
164 __m128i tmp0, tmp1, tmp2, tmp3;
165 __m128i reg0, reg1, reg2, reg3;
166 __m128i dst0, dst1, dst2, dst3;
167
168 for (x = 0; x < len; x++) {
169 src0 = __lsx_vld(src_argb4444, 0);
170 src1 = __lsx_vld(src_argb4444, 16);
171 tmp0 = __lsx_vandi_b(src0, 0x0F);
172 tmp1 = __lsx_vandi_b(src0, 0xF0);
173 tmp2 = __lsx_vandi_b(src1, 0x0F);
174 tmp3 = __lsx_vandi_b(src1, 0xF0);
175 reg0 = __lsx_vslli_b(tmp0, 4);
176 reg2 = __lsx_vslli_b(tmp2, 4);
177 reg1 = __lsx_vsrli_b(tmp1, 4);
178 reg3 = __lsx_vsrli_b(tmp3, 4);
179 DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0,
180 tmp1, tmp2, tmp3);
181 dst0 = __lsx_vilvl_b(tmp1, tmp0);
182 dst2 = __lsx_vilvl_b(tmp3, tmp2);
183 dst1 = __lsx_vilvh_b(tmp1, tmp0);
184 dst3 = __lsx_vilvh_b(tmp3, tmp2);
185 __lsx_vst(dst0, dst_argb, 0);
186 __lsx_vst(dst1, dst_argb, 16);
187 __lsx_vst(dst2, dst_argb, 32);
188 __lsx_vst(dst3, dst_argb, 48);
189 dst_argb += 64;
190 src_argb4444 += 32;
191 }
192 }
193
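// Convert 16 ARGB1555 pixels per loop to ARGB; 5-bit channels expand to
// 8 bits and the 1-bit alpha becomes 0x00 or 0xFF.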
194 void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
195 uint8_t* dst_argb,
196 int width) {
197 int x;
198 int len = width / 16;
199 __m128i src0, src1;
200 __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
201 __m128i reg0, reg1, reg2;
202 __m128i dst0, dst1, dst2, dst3;
203
204 for (x = 0; x < len; x++) {
205 src0 = __lsx_vld(src_argb1555, 0);
206 src1 = __lsx_vld(src_argb1555, 16);
207 tmp0 = __lsx_vpickev_b(src1, src0);
208 tmp1 = __lsx_vpickod_b(src1, src0);
209 tmpb = __lsx_vandi_b(tmp0, 0x1F);
210 tmpg = __lsx_vsrli_b(tmp0, 5);
211 reg0 = __lsx_vandi_b(tmp1, 0x03);
212 reg0 = __lsx_vslli_b(reg0, 3);
213 tmpg = __lsx_vor_v(tmpg, reg0);
214 reg1 = __lsx_vandi_b(tmp1, 0x7C);
215 tmpr = __lsx_vsrli_b(reg1, 2);
216 tmpa = __lsx_vsrli_b(tmp1, 7);
217 tmpa = __lsx_vneg_b(tmpa);
218 reg0 = __lsx_vslli_b(tmpb, 3);
219 reg1 = __lsx_vslli_b(tmpg, 3);
220 reg2 = __lsx_vslli_b(tmpr, 3);
221 tmpb = __lsx_vsrli_b(tmpb, 2);
222 tmpg = __lsx_vsrli_b(tmpg, 2);
223 tmpr = __lsx_vsrli_b(tmpr, 2);
224 tmpb = __lsx_vor_v(reg0, tmpb);
225 tmpg = __lsx_vor_v(reg1, tmpg);
226 tmpr = __lsx_vor_v(reg2, tmpr);
227 DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
228 dst0 = __lsx_vilvl_h(reg1, reg0);
229 dst1 = __lsx_vilvh_h(reg1, reg0);
230 DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
231 dst2 = __lsx_vilvl_h(reg1, reg0);
232 dst3 = __lsx_vilvh_h(reg1, reg0);
233 __lsx_vst(dst0, dst_argb, 0);
234 __lsx_vst(dst1, dst_argb, 16);
235 __lsx_vst(dst2, dst_argb, 32);
236 __lsx_vst(dst3, dst_argb, 48);
237 dst_argb += 64;
238 src_argb1555 += 32;
239 }
240 }
241
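// Convert 16 RGB565 pixels per loop to ARGB with alpha set to 0xFF.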
242 void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
243 uint8_t* dst_argb,
244 int width) {
245 int x;
246 int len = width / 16;
247 __m128i src0, src1;
248 __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
249 __m128i reg0, reg1, dst0, dst1, dst2, dst3;
250 __m128i alpha = __lsx_vldi(0xFF);
251
252 for (x = 0; x < len; x++) {
253 src0 = __lsx_vld(src_rgb565, 0);
254 src1 = __lsx_vld(src_rgb565, 16);
255 tmp0 = __lsx_vpickev_b(src1, src0);
256 tmp1 = __lsx_vpickod_b(src1, src0);
257 tmpb = __lsx_vandi_b(tmp0, 0x1F);
258 tmpr = __lsx_vandi_b(tmp1, 0xF8);
259 reg1 = __lsx_vandi_b(tmp1, 0x07);
260 reg0 = __lsx_vsrli_b(tmp0, 5);
261 reg1 = __lsx_vslli_b(reg1, 3);
262 tmpg = __lsx_vor_v(reg1, reg0);
263 reg0 = __lsx_vslli_b(tmpb, 3);
264 reg1 = __lsx_vsrli_b(tmpb, 2);
265 tmpb = __lsx_vor_v(reg1, reg0);
266 reg0 = __lsx_vslli_b(tmpg, 2);
267 reg1 = __lsx_vsrli_b(tmpg, 4);
268 tmpg = __lsx_vor_v(reg1, reg0);
269 reg0 = __lsx_vsrli_b(tmpr, 5);
270 tmpr = __lsx_vor_v(tmpr, reg0);
271 DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
272 dst0 = __lsx_vilvl_h(reg1, reg0);
273 dst1 = __lsx_vilvh_h(reg1, reg0);
274 DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
275 dst2 = __lsx_vilvl_h(reg1, reg0);
276 dst3 = __lsx_vilvh_h(reg1, reg0);
277 __lsx_vst(dst0, dst_argb, 0);
278 __lsx_vst(dst1, dst_argb, 16);
279 __lsx_vst(dst2, dst_argb, 32);
280 __lsx_vst(dst3, dst_argb, 48);
281 dst_argb += 64;
282 src_rgb565 += 32;
283 }
284 }
285
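// Convert 16 RGB24 pixels per loop to ARGB by inserting a 0xFF alpha byte.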
286 void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
287 uint8_t* dst_argb,
288 int width) {
289 int x;
290 int len = width / 16;
291 __m128i src0, src1, src2;
292 __m128i tmp0, tmp1, tmp2;
293 __m128i dst0, dst1, dst2, dst3;
294 __m128i alpha = __lsx_vldi(0xFF);
295 __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
296 __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
297 __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
298 __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706};
299
300 for (x = 0; x < len; x++) {
301 src0 = __lsx_vld(src_rgb24, 0);
302 src1 = __lsx_vld(src_rgb24, 16);
303 src2 = __lsx_vld(src_rgb24, 32);
304 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
305 tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
306 DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
307 tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
308 __lsx_vst(dst0, dst_argb, 0);
309 __lsx_vst(dst1, dst_argb, 16);
310 __lsx_vst(dst2, dst_argb, 32);
311 __lsx_vst(dst3, dst_argb, 48);
312 dst_argb += 64;
313 src_rgb24 += 48;
314 }
315 }
316
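// Convert 16 RAW pixels per loop to ARGB, swapping R and B relative to
// RGB24 and setting alpha to 0xFF.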
317 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
318 int x;
319 int len = width / 16;
320 __m128i src0, src1, src2;
321 __m128i tmp0, tmp1, tmp2;
322 __m128i dst0, dst1, dst2, dst3;
323 __m128i alpha = __lsx_vldi(0xFF);
324 __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
325 __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
326 __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
327 __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708};
328
329 for (x = 0; x < len; x++) {
330 src0 = __lsx_vld(src_raw, 0);
331 src1 = __lsx_vld(src_raw, 16);
332 src2 = __lsx_vld(src_raw, 32);
333 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
334 tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
335 DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
336 tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
337 __lsx_vst(dst0, dst_argb, 0);
338 __lsx_vst(dst1, dst_argb, 16);
339 __lsx_vst(dst2, dst_argb, 32);
340 __lsx_vst(dst3, dst_argb, 48);
341 dst_argb += 64;
342 src_raw += 48;
343 }
344 }
345
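// Convert 16 ARGB1555 pixels per loop to 16 Y values
// (BT.601: (66*R + 129*G + 25*B + 0x1080) >> 8).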
346 void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
347 uint8_t* dst_y,
348 int width) {
349 int x;
350 int len = width / 16;
351 __m128i src0, src1;
352 __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
353 __m128i reg0, reg1, reg2, dst0;
354 __m128i const_66 = __lsx_vldi(66);
355 __m128i const_129 = __lsx_vldi(129);
356 __m128i const_25 = __lsx_vldi(25);
357 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
358
359 for (x = 0; x < len; x++) {
360 src0 = __lsx_vld(src_argb1555, 0);
361 src1 = __lsx_vld(src_argb1555, 16);
362 tmp0 = __lsx_vpickev_b(src1, src0);
363 tmp1 = __lsx_vpickod_b(src1, src0);
364 tmpb = __lsx_vandi_b(tmp0, 0x1F);
365 tmpg = __lsx_vsrli_b(tmp0, 5);
366 reg0 = __lsx_vandi_b(tmp1, 0x03);
367 reg0 = __lsx_vslli_b(reg0, 3);
368 tmpg = __lsx_vor_v(tmpg, reg0);
369 reg1 = __lsx_vandi_b(tmp1, 0x7C);
370 tmpr = __lsx_vsrli_b(reg1, 2);
371 reg0 = __lsx_vslli_b(tmpb, 3);
372 reg1 = __lsx_vslli_b(tmpg, 3);
373 reg2 = __lsx_vslli_b(tmpr, 3);
374 tmpb = __lsx_vsrli_b(tmpb, 2);
375 tmpg = __lsx_vsrli_b(tmpg, 2);
376 tmpr = __lsx_vsrli_b(tmpr, 2);
377 tmpb = __lsx_vor_v(reg0, tmpb);
378 tmpg = __lsx_vor_v(reg1, tmpg);
379 tmpr = __lsx_vor_v(reg2, tmpr);
380 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
381 reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
382 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
383 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
384 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
385 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
386 dst0 = __lsx_vpackod_b(reg1, reg0);
387 __lsx_vst(dst0, dst_y, 0);
388 dst_y += 16;
389 src_argb1555 += 32;
390 }
391 }
392
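// Convert two rows of 16 ARGB1555 pixels to 8 U and 8 V values
// (2x2 subsampled).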
393 void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
394 int src_stride_argb1555,
395 uint8_t* dst_u,
396 uint8_t* dst_v,
397 int width) {
398 int x;
399 int len = width / 16;
400 const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
401 __m128i src0, src1, src2, src3;
402 __m128i tmp0, tmp1, tmp2, tmp3;
403 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
404 __m128i reg0, reg1, reg2, reg3, dst0;
405 __m128i const_112 = __lsx_vldi(0x438);
406 __m128i const_74 = __lsx_vldi(0x425);
407 __m128i const_38 = __lsx_vldi(0x413);
408 __m128i const_94 = __lsx_vldi(0x42F);
409 __m128i const_18 = __lsx_vldi(0x409);
410 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
411
412 for (x = 0; x < len; x++) {
413 DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
414 next_argb1555, 16, src0, src1, src2, src3);
415 DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
416 DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
417 tmpb = __lsx_vandi_b(tmp0, 0x1F);
418 nexb = __lsx_vandi_b(tmp2, 0x1F);
419 tmpg = __lsx_vsrli_b(tmp0, 5);
420 nexg = __lsx_vsrli_b(tmp2, 5);
421 reg0 = __lsx_vandi_b(tmp1, 0x03);
422 reg2 = __lsx_vandi_b(tmp3, 0x03);
423 reg0 = __lsx_vslli_b(reg0, 3);
424 reg2 = __lsx_vslli_b(reg2, 3);
425 tmpg = __lsx_vor_v(tmpg, reg0);
426 nexg = __lsx_vor_v(nexg, reg2);
427 reg1 = __lsx_vandi_b(tmp1, 0x7C);
428 reg3 = __lsx_vandi_b(tmp3, 0x7C);
429 tmpr = __lsx_vsrli_b(reg1, 2);
430 nexr = __lsx_vsrli_b(reg3, 2);
431 reg0 = __lsx_vslli_b(tmpb, 3);
432 reg1 = __lsx_vslli_b(tmpg, 3);
433 reg2 = __lsx_vslli_b(tmpr, 3);
434 tmpb = __lsx_vsrli_b(tmpb, 2);
435 tmpg = __lsx_vsrli_b(tmpg, 2);
436 tmpr = __lsx_vsrli_b(tmpr, 2);
437 tmpb = __lsx_vor_v(reg0, tmpb);
438 tmpg = __lsx_vor_v(reg1, tmpg);
439 tmpr = __lsx_vor_v(reg2, tmpr);
440 reg0 = __lsx_vslli_b(nexb, 3);
441 reg1 = __lsx_vslli_b(nexg, 3);
442 reg2 = __lsx_vslli_b(nexr, 3);
443 nexb = __lsx_vsrli_b(nexb, 2);
444 nexg = __lsx_vsrli_b(nexg, 2);
445 nexr = __lsx_vsrli_b(nexr, 2);
446 nexb = __lsx_vor_v(reg0, nexb);
447 nexg = __lsx_vor_v(reg1, nexg);
448 nexr = __lsx_vor_v(reg2, nexr);
449 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
450 __lsx_vstelm_d(dst0, dst_u, 0, 0);
451 __lsx_vstelm_d(dst0, dst_v, 0, 1);
452 dst_u += 8;
453 dst_v += 8;
454 src_argb1555 += 32;
455 next_argb1555 += 32;
456 }
457 }
458
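// Convert 16 RGB565 pixels per loop to 16 Y values (BT.601).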
459 void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
460 int x;
461 int len = width / 16;
462 __m128i src0, src1;
463 __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
464 __m128i reg0, reg1, dst0;
465 __m128i const_66 = __lsx_vldi(66);
466 __m128i const_129 = __lsx_vldi(129);
467 __m128i const_25 = __lsx_vldi(25);
468 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
469
470 for (x = 0; x < len; x++) {
471 src0 = __lsx_vld(src_rgb565, 0);
472 src1 = __lsx_vld(src_rgb565, 16);
473 tmp0 = __lsx_vpickev_b(src1, src0);
474 tmp1 = __lsx_vpickod_b(src1, src0);
475 tmpb = __lsx_vandi_b(tmp0, 0x1F);
476 tmpr = __lsx_vandi_b(tmp1, 0xF8);
477 reg1 = __lsx_vandi_b(tmp1, 0x07);
478 reg0 = __lsx_vsrli_b(tmp0, 5);
479 reg1 = __lsx_vslli_b(reg1, 3);
480 tmpg = __lsx_vor_v(reg1, reg0);
481 reg0 = __lsx_vslli_b(tmpb, 3);
482 reg1 = __lsx_vsrli_b(tmpb, 2);
483 tmpb = __lsx_vor_v(reg1, reg0);
484 reg0 = __lsx_vslli_b(tmpg, 2);
485 reg1 = __lsx_vsrli_b(tmpg, 4);
486 tmpg = __lsx_vor_v(reg1, reg0);
487 reg0 = __lsx_vsrli_b(tmpr, 5);
488 tmpr = __lsx_vor_v(tmpr, reg0);
489 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
490 reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
491 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
492 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
493 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
494 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
495 dst0 = __lsx_vpackod_b(reg1, reg0);
496 __lsx_vst(dst0, dst_y, 0);
497 dst_y += 16;
498 src_rgb565 += 32;
499 }
500 }
501
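// Convert two rows of 16 RGB565 pixels to 8 U and 8 V values
// (2x2 subsampled).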
502 void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
503 int src_stride_rgb565,
504 uint8_t* dst_u,
505 uint8_t* dst_v,
506 int width) {
507 int x;
508 int len = width / 16;
509 const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
510 __m128i src0, src1, src2, src3;
511 __m128i tmp0, tmp1, tmp2, tmp3;
512 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
513 __m128i reg0, reg1, reg2, reg3, dst0;
514 __m128i const_112 = __lsx_vldi(0x438);
515 __m128i const_74 = __lsx_vldi(0x425);
516 __m128i const_38 = __lsx_vldi(0x413);
517 __m128i const_94 = __lsx_vldi(0x42F);
518 __m128i const_18 = __lsx_vldi(0x409);
519 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
520
521 for (x = 0; x < len; x++) {
522 DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
523 next_rgb565, 16, src0, src1, src2, src3);
524 DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
525 DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
526 tmpb = __lsx_vandi_b(tmp0, 0x1F);
527 tmpr = __lsx_vandi_b(tmp1, 0xF8);
528 nexb = __lsx_vandi_b(tmp2, 0x1F);
529 nexr = __lsx_vandi_b(tmp3, 0xF8);
530 reg1 = __lsx_vandi_b(tmp1, 0x07);
531 reg3 = __lsx_vandi_b(tmp3, 0x07);
532 reg0 = __lsx_vsrli_b(tmp0, 5);
533 reg1 = __lsx_vslli_b(reg1, 3);
534 reg2 = __lsx_vsrli_b(tmp2, 5);
535 reg3 = __lsx_vslli_b(reg3, 3);
536 tmpg = __lsx_vor_v(reg1, reg0);
537 nexg = __lsx_vor_v(reg2, reg3);
538 reg0 = __lsx_vslli_b(tmpb, 3);
539 reg1 = __lsx_vsrli_b(tmpb, 2);
540 reg2 = __lsx_vslli_b(nexb, 3);
541 reg3 = __lsx_vsrli_b(nexb, 2);
542 tmpb = __lsx_vor_v(reg1, reg0);
543 nexb = __lsx_vor_v(reg2, reg3);
544 reg0 = __lsx_vslli_b(tmpg, 2);
545 reg1 = __lsx_vsrli_b(tmpg, 4);
546 reg2 = __lsx_vslli_b(nexg, 2);
547 reg3 = __lsx_vsrli_b(nexg, 4);
548 tmpg = __lsx_vor_v(reg1, reg0);
549 nexg = __lsx_vor_v(reg2, reg3);
550 reg0 = __lsx_vsrli_b(tmpr, 5);
551 reg2 = __lsx_vsrli_b(nexr, 5);
552 tmpr = __lsx_vor_v(tmpr, reg0);
553 nexr = __lsx_vor_v(nexr, reg2);
554 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
555 __lsx_vstelm_d(dst0, dst_u, 0, 0);
556 __lsx_vstelm_d(dst0, dst_v, 0, 1);
557 dst_u += 8;
558 dst_v += 8;
559 src_rgb565 += 32;
560 next_rgb565 += 32;
561 }
562 }
563
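// Convert 16 RGB24 pixels per loop to 16 Y values (BT.601).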
564 void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
565 int x;
566 int len = width / 16;
567 __m128i src0, src1, src2;
568 __m128i tmp0, tmp1, tmp2, tmp3;
569 __m128i reg0, reg1, dst0;
570 __m128i const_129 = __lsx_vldi(129);
571 __m128i const_br = {0x4219421942194219, 0x4219421942194219};
572 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
573 __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C};
574 __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
575 __m128i shuff2 = {0x000A000700040001, 0x001600130010000D};
576 __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005};
577
578 for (x = 0; x < len; x++) {
579 src0 = __lsx_vld(src_rgb24, 0);
580 src1 = __lsx_vld(src_rgb24, 16);
581 src2 = __lsx_vld(src_rgb24, 32);
582 tmp0 = __lsx_vshuf_b(src1, src0, shuff0);
583 tmp1 = __lsx_vshuf_b(src1, src2, shuff1);
584 tmp2 = __lsx_vshuf_b(src1, src0, shuff2);
585 tmp3 = __lsx_vshuf_b(src1, src2, shuff3);
586 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129);
587 reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129);
588 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
589 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1);
590 dst0 = __lsx_vpickod_b(reg1, reg0);
591 __lsx_vst(dst0, dst_y, 0);
592 dst_y += 16;
593 src_rgb24 += 48;
594 }
595 }
596
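// Convert two rows of 16 RGB24 pixels to 8 U and 8 V values
// (2x2 subsampled).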
597 void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
598 int src_stride_rgb24,
599 uint8_t* dst_u,
600 uint8_t* dst_v,
601 int width) {
602 int x;
603 const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
604 int len = width / 16;
605 __m128i src0, src1, src2;
606 __m128i nex0, nex1, nex2, dst0;
607 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
608 __m128i const_112 = __lsx_vldi(0x438);
609 __m128i const_74 = __lsx_vldi(0x425);
610 __m128i const_38 = __lsx_vldi(0x413);
611 __m128i const_94 = __lsx_vldi(0x42F);
612 __m128i const_18 = __lsx_vldi(0x409);
613 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
614 __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
615 __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
616 __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
617 __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
618 __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
619 __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};
620
621 for (x = 0; x < len; x++) {
622 src0 = __lsx_vld(src_rgb24, 0);
623 src1 = __lsx_vld(src_rgb24, 16);
624 src2 = __lsx_vld(src_rgb24, 32);
625 nex0 = __lsx_vld(next_rgb24, 0);
626 nex1 = __lsx_vld(next_rgb24, 16);
627 nex2 = __lsx_vld(next_rgb24, 32);
628 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
629 nexb);
630 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
631 nexg);
632 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
633 nexr);
634 DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
635 nexb);
636 DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
637 nexg);
638 DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
639 nexr);
640 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
641 __lsx_vstelm_d(dst0, dst_u, 0, 0);
642 __lsx_vstelm_d(dst0, dst_v, 0, 1);
643 dst_u += 8;
644 dst_v += 8;
645 src_rgb24 += 48;
646 next_rgb24 += 48;
647 }
648 }
649
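// Convert 16 RAW pixels per loop to 16 Y values (BT.601).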
650 void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
651 int x;
652 int len = width / 16;
653 __m128i src0, src1, src2;
654 __m128i tmp0, tmp1, tmp2, tmp3;
655 __m128i reg0, reg1, dst0;
656 __m128i const_129 = __lsx_vldi(129);
657 __m128i const_br = {0x1942194219421942, 0x1942194219421942};
658 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
659 __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C};
660 __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
661 __m128i shuff2 = {0x000A000700040001, 0x001600130010000D};
662 __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005};
663
664 for (x = 0; x < len; x++) {
665 src0 = __lsx_vld(src_raw, 0);
666 src1 = __lsx_vld(src_raw, 16);
667 src2 = __lsx_vld(src_raw, 32);
668 tmp0 = __lsx_vshuf_b(src1, src0, shuff0);
669 tmp1 = __lsx_vshuf_b(src1, src2, shuff1);
670 tmp2 = __lsx_vshuf_b(src1, src0, shuff2);
671 tmp3 = __lsx_vshuf_b(src1, src2, shuff3);
672 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129);
673 reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129);
674 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
675 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1);
676 dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8);
677 __lsx_vst(dst0, dst_y, 0);
678 dst_y += 16;
679 src_raw += 48;
680 }
681 }
682
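// Convert two rows of 16 RAW pixels to 8 U and 8 V values (2x2 subsampled).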
683 void RAWToUVRow_LSX(const uint8_t* src_raw,
684 int src_stride_raw,
685 uint8_t* dst_u,
686 uint8_t* dst_v,
687 int width) {
688 int x;
689 const uint8_t* next_raw = src_raw + src_stride_raw;
690 int len = width / 16;
691 __m128i src0, src1, src2;
692 __m128i nex0, nex1, nex2, dst0;
693 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
694 __m128i const_112 = __lsx_vldi(0x438);
695 __m128i const_74 = __lsx_vldi(0x425);
696 __m128i const_38 = __lsx_vldi(0x413);
697 __m128i const_94 = __lsx_vldi(0x42F);
698 __m128i const_18 = __lsx_vldi(0x409);
699 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
700 __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
701 __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
702 __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
703 __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
704 __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
705 __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};
706
707 for (x = 0; x < len; x++) {
708 src0 = __lsx_vld(src_raw, 0);
709 src1 = __lsx_vld(src_raw, 16);
710 src2 = __lsx_vld(src_raw, 32);
711 nex0 = __lsx_vld(next_raw, 0);
712 nex1 = __lsx_vld(next_raw, 16);
713 nex2 = __lsx_vld(next_raw, 32);
714 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
715 nexb);
716 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
717 nexg);
718 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
719 nexr);
720 DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
721 nexb);
722 DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
723 nexg);
724 DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
725 nexr);
726 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
727 __lsx_vstelm_d(dst0, dst_u, 0, 0);
728 __lsx_vstelm_d(dst0, dst_v, 0, 1);
729 dst_u += 8;
730 dst_v += 8;
731 src_raw += 48;
732 next_raw += 48;
733 }
734 }
735
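// Convert 8 pixels of NV12 (Y plane plus interleaved UV) per loop to ARGB.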
736 void NV12ToARGBRow_LSX(const uint8_t* src_y,
737 const uint8_t* src_uv,
738 uint8_t* dst_argb,
739 const struct YuvConstants* yuvconstants,
740 int width) {
741 int x;
742 int len = width / 8;
743 __m128i vec_y, vec_vu;
744 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
745 __m128i vec_vrub, vec_vgug;
746 __m128i out_b, out_g, out_r;
747 __m128i const_80 = __lsx_vldi(0x480);
748 __m128i alpha = __lsx_vldi(0xFF);
749 __m128i zero = __lsx_vldi(0);
750
751 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
752 vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
753 vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
754
755 for (x = 0; x < len; x++) {
756 vec_y = __lsx_vld(src_y, 0);
757 vec_vu = __lsx_vld(src_uv, 0);
758 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
759 out_r);
760 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
761 src_y += 8;
762 src_uv += 8;
763 }
764 }
765
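// Convert 8 pixels of NV12 per loop to RGB565.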
766 void NV12ToRGB565Row_LSX(const uint8_t* src_y,
767 const uint8_t* src_uv,
768 uint8_t* dst_rgb565,
769 const struct YuvConstants* yuvconstants,
770 int width) {
771 int x;
772 int len = width / 8;
773 __m128i vec_y, vec_vu;
774 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
775 __m128i vec_vrub, vec_vgug;
776 __m128i out_b, out_g, out_r;
777 __m128i const_80 = __lsx_vldi(0x480);
778 __m128i zero = __lsx_vldi(0);
779
780 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
781 vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
782 vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
783
784 for (x = 0; x < len; x++) {
785 vec_y = __lsx_vld(src_y, 0);
786 vec_vu = __lsx_vld(src_uv, 0);
787 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
788 out_r);
789 out_b = __lsx_vsrli_h(out_b, 3);
790 out_g = __lsx_vsrli_h(out_g, 2);
791 out_r = __lsx_vsrli_h(out_r, 3);
792 out_g = __lsx_vslli_h(out_g, 5);
793 out_r = __lsx_vslli_h(out_r, 11);
794 out_r = __lsx_vor_v(out_r, out_g);
795 out_r = __lsx_vor_v(out_r, out_b);
796 __lsx_vst(out_r, dst_rgb565, 0);
797 src_y += 8;
798 src_uv += 8;
799 dst_rgb565 += 16;
800 }
801 }
802
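// Convert 8 pixels of NV21 (Y plane plus interleaved VU) per loop to ARGB.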
803 void NV21ToARGBRow_LSX(const uint8_t* src_y,
804 const uint8_t* src_vu,
805 uint8_t* dst_argb,
806 const struct YuvConstants* yuvconstants,
807 int width) {
808 int x;
809 int len = width / 8;
810 __m128i vec_y, vec_uv;
811 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
812 __m128i vec_ubvr, vec_ugvg;
813 __m128i out_b, out_g, out_r;
814 __m128i const_80 = __lsx_vldi(0x480);
815 __m128i alpha = __lsx_vldi(0xFF);
816 __m128i zero = __lsx_vldi(0);
817
818 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
819 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
820 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
821
822 for (x = 0; x < len; x++) {
823 vec_y = __lsx_vld(src_y, 0);
824 vec_uv = __lsx_vld(src_vu, 0);
825 YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
826 out_b);
827 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
828 src_y += 8;
829 src_vu += 8;
830 }
831 }
832
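// Saturating add of Sobel X and Sobel Y, stored as grey ARGB;
// 16 pixels per loop.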
833 void SobelRow_LSX(const uint8_t* src_sobelx,
834 const uint8_t* src_sobely,
835 uint8_t* dst_argb,
836 int width) {
837 int x;
838 int len = width / 16;
839 __m128i src0, src1, tmp0;
840 __m128i out0, out1, out2, out3;
841 __m128i alpha = __lsx_vldi(0xFF);
842 __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
843 __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
844 __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
845 __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);
846
847 for (x = 0; x < len; x++) {
848 src0 = __lsx_vld(src_sobelx, 0);
849 src1 = __lsx_vld(src_sobely, 0);
850 tmp0 = __lsx_vsadd_bu(src0, src1);
851 DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
852 tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
853 __lsx_vst(out0, dst_argb, 0);
854 __lsx_vst(out1, dst_argb, 16);
855 __lsx_vst(out2, dst_argb, 32);
856 __lsx_vst(out3, dst_argb, 48);
857 src_sobelx += 16;
858 src_sobely += 16;
859 dst_argb += 64;
860 }
861 }
862
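// Saturating add of Sobel X and Sobel Y into a single plane;
// 32 pixels per loop.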
863 void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
864 const uint8_t* src_sobely,
865 uint8_t* dst_y,
866 int width) {
867 int x;
868 int len = width / 32;
869 __m128i src0, src1, src2, src3, dst0, dst1;
870
871 for (x = 0; x < len; x++) {
872 DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1);
873 DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3);
874 dst0 = __lsx_vsadd_bu(src0, src2);
875 dst1 = __lsx_vsadd_bu(src1, src3);
876 __lsx_vst(dst0, dst_y, 0);
877 __lsx_vst(dst1, dst_y, 16);
878 src_sobelx += 32;
879 src_sobely += 32;
880 dst_y += 32;
881 }
882 }
883
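// Mix Sobel X (R), Sobel Y (B) and their saturating sum (G) into ARGB;
// 16 pixels per loop.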
884 void SobelXYRow_LSX(const uint8_t* src_sobelx,
885 const uint8_t* src_sobely,
886 uint8_t* dst_argb,
887 int width) {
888 int x;
889 int len = width / 16;
890 __m128i src_r, src_b, src_g;
891 __m128i tmp0, tmp1, tmp2, tmp3;
892 __m128i dst0, dst1, dst2, dst3;
893 __m128i alpha = __lsx_vldi(0xFF);
894
895 for (x = 0; x < len; x++) {
896 src_r = __lsx_vld(src_sobelx, 0);
897 src_b = __lsx_vld(src_sobely, 0);
898 src_g = __lsx_vsadd_bu(src_r, src_b);
899 tmp0 = __lsx_vilvl_b(src_g, src_b);
900 tmp1 = __lsx_vilvh_b(src_g, src_b);
901 tmp2 = __lsx_vilvl_b(alpha, src_r);
902 tmp3 = __lsx_vilvh_b(alpha, src_r);
903 dst0 = __lsx_vilvl_h(tmp2, tmp0);
904 dst1 = __lsx_vilvh_h(tmp2, tmp0);
905 dst2 = __lsx_vilvl_h(tmp3, tmp1);
906 dst3 = __lsx_vilvh_h(tmp3, tmp1);
907 __lsx_vst(dst0, dst_argb, 0);
908 __lsx_vst(dst1, dst_argb, 16);
909 __lsx_vst(dst2, dst_argb, 32);
910 __lsx_vst(dst3, dst_argb, 48);
911 src_sobelx += 16;
912 src_sobely += 16;
913 dst_argb += 64;
914 }
915 }
916
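// Convert 16 ARGB pixels per loop to 16 full-range (JPEG) Y values:
// (77*R + 150*G + 29*B + 128) >> 8.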
917 void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
918 int x;
919 int len = width / 16;
920 __m128i src0, src1, src2, src3, dst0;
921 __m128i tmp0, tmp1, tmp2, tmp3;
922 __m128i reg0, reg1;
923 __m128i const_128 = __lsx_vldi(0x480);
924 __m128i const_150 = __lsx_vldi(0x96);
925 __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
926
927 for (x = 0; x < len; x++) {
928 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
929 src0, src1, src2, src3);
930 tmp0 = __lsx_vpickev_b(src1, src0);
931 tmp1 = __lsx_vpickod_b(src1, src0);
932 tmp2 = __lsx_vpickev_b(src3, src2);
933 tmp3 = __lsx_vpickod_b(src3, src2);
934 reg0 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150);
935 reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150);
936 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
937 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2);
938 dst0 = __lsx_vpickod_b(reg1, reg0);
939 __lsx_vst(dst0, dst_y, 0);
940 dst_y += 16;
941 src_argb += 64;
942 }
943 }
944
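// Convert 16 BGRA pixels per loop to 16 Y values (BT.601).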
945 void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
946 int x;
947 int len = width / 16;
948 __m128i src0, src1, src2, src3, dst0;
949 __m128i tmp0, tmp1, tmp2, tmp3;
950 __m128i reg0, reg1;
951 __m128i const_129 = __lsx_vldi(0x81);
952 __m128i const_br = {0x1942194219421942, 0x1942194219421942};
953 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
954
955 for (x = 0; x < len; x++) {
956 DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
957 src0, src1, src2, src3);
958 tmp0 = __lsx_vpickod_b(src1, src0);
959 tmp1 = __lsx_vpickev_b(src1, src0);
960 tmp2 = __lsx_vpickod_b(src3, src2);
961 tmp3 = __lsx_vpickev_b(src3, src2);
962 reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129);
963 reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129);
964 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
965 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2);
966 dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8);
967 __lsx_vst(dst0, dst_y, 0);
968 dst_y += 16;
969 src_bgra += 64;
970 }
971 }
972
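// Convert two rows of 16 BGRA pixels to 8 U and 8 V values (2x2 subsampled).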
973 void BGRAToUVRow_LSX(const uint8_t* src_bgra,
974 int src_stride_bgra,
975 uint8_t* dst_u,
976 uint8_t* dst_v,
977 int width) {
978 int x;
979 const uint8_t* next_bgra = src_bgra + src_stride_bgra;
980 int len = width / 16;
981 __m128i src0, src1, src2, src3;
982 __m128i nex0, nex1, nex2, nex3;
983 __m128i tmp0, tmp1, tmp2, tmp3, dst0;
984 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
985 __m128i const_112 = __lsx_vldi(0x438);
986 __m128i const_74 = __lsx_vldi(0x425);
987 __m128i const_38 = __lsx_vldi(0x413);
988 __m128i const_94 = __lsx_vldi(0x42F);
989 __m128i const_18 = __lsx_vldi(0x409);
990 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
991
992 for (x = 0; x < len; x++) {
993 DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
994 src0, src1, src2, src3);
995 DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
996 48, nex0, nex1, nex2, nex3);
997 tmp0 = __lsx_vpickod_b(src1, src0);
998 tmp1 = __lsx_vpickev_b(src1, src0);
999 tmp2 = __lsx_vpickod_b(src3, src2);
1000 tmp3 = __lsx_vpickev_b(src3, src2);
1001 tmpb = __lsx_vpickod_b(tmp2, tmp0);
1002 tmpr = __lsx_vpickev_b(tmp2, tmp0);
1003 tmpg = __lsx_vpickod_b(tmp3, tmp1);
1004 tmp0 = __lsx_vpickod_b(nex1, nex0);
1005 tmp1 = __lsx_vpickev_b(nex1, nex0);
1006 tmp2 = __lsx_vpickod_b(nex3, nex2);
1007 tmp3 = __lsx_vpickev_b(nex3, nex2);
1008 nexb = __lsx_vpickod_b(tmp2, tmp0);
1009 nexr = __lsx_vpickev_b(tmp2, tmp0);
1010 nexg = __lsx_vpickod_b(tmp3, tmp1);
1011 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1012 __lsx_vstelm_d(dst0, dst_u, 0, 0);
1013 __lsx_vstelm_d(dst0, dst_v, 0, 1);
1014 dst_u += 8;
1015 dst_v += 8;
1016 src_bgra += 64;
1017 next_bgra += 64;
1018 }
1019 }
1020
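// Convert 16 ABGR pixels per loop to 16 Y values (BT.601).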
1021 void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1022 int x;
1023 int len = width / 16;
1024 __m128i src0, src1, src2, src3, dst0;
1025 __m128i tmp0, tmp1, tmp2, tmp3;
1026 __m128i reg0, reg1;
1027 __m128i const_129 = __lsx_vldi(0x81);
1028 __m128i const_br = {0x1942194219421942, 0x1942194219421942};
1029 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
1030
1031 for (x = 0; x < len; x++) {
1032 DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
1033 src0, src1, src2, src3);
1034 tmp0 = __lsx_vpickev_b(src1, src0);
1035 tmp1 = __lsx_vpickod_b(src1, src0);
1036 tmp2 = __lsx_vpickev_b(src3, src2);
1037 tmp3 = __lsx_vpickod_b(src3, src2);
1038 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp1, const_129);
1039 reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129);
1040 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
1041 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2);
1042 dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8);
1043 __lsx_vst(dst0, dst_y, 0);
1044 dst_y += 16;
1045 src_abgr += 64;
1046 }
1047 }
1048
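// Convert two rows of 16 ABGR pixels to 8 U and 8 V values (2x2 subsampled).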
1049 void ABGRToUVRow_LSX(const uint8_t* src_abgr,
1050 int src_stride_abgr,
1051 uint8_t* dst_u,
1052 uint8_t* dst_v,
1053 int width) {
1054 int x;
1055 const uint8_t* next_abgr = src_abgr + src_stride_abgr;
1056 int len = width / 16;
1057 __m128i src0, src1, src2, src3;
1058 __m128i nex0, nex1, nex2, nex3;
1059 __m128i tmp0, tmp1, tmp2, tmp3, dst0;
1060 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1061 __m128i const_112 = __lsx_vldi(0x438);
1062 __m128i const_74 = __lsx_vldi(0x425);
1063 __m128i const_38 = __lsx_vldi(0x413);
1064 __m128i const_94 = __lsx_vldi(0x42F);
1065 __m128i const_18 = __lsx_vldi(0x409);
1066 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
1067
1068 for (x = 0; x < len; x++) {
1069 DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
1070 src0, src1, src2, src3);
1071 DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
1072 48, nex0, nex1, nex2, nex3);
1073 tmp0 = __lsx_vpickev_b(src1, src0);
1074 tmp1 = __lsx_vpickod_b(src1, src0);
1075 tmp2 = __lsx_vpickev_b(src3, src2);
1076 tmp3 = __lsx_vpickod_b(src3, src2);
1077 tmpb = __lsx_vpickod_b(tmp2, tmp0);
1078 tmpr = __lsx_vpickev_b(tmp2, tmp0);
1079 tmpg = __lsx_vpickev_b(tmp3, tmp1);
1080 tmp0 = __lsx_vpickev_b(nex1, nex0);
1081 tmp1 = __lsx_vpickod_b(nex1, nex0);
1082 tmp2 = __lsx_vpickev_b(nex3, nex2);
1083 tmp3 = __lsx_vpickod_b(nex3, nex2);
1084 nexb = __lsx_vpickod_b(tmp2, tmp0);
1085 nexr = __lsx_vpickev_b(tmp2, tmp0);
1086 nexg = __lsx_vpickev_b(tmp3, tmp1);
1087 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1088 __lsx_vstelm_d(dst0, dst_u, 0, 0);
1089 __lsx_vstelm_d(dst0, dst_v, 0, 1);
1090 dst_u += 8;
1091 dst_v += 8;
1092 src_abgr += 64;
1093 next_abgr += 64;
1094 }
1095 }
1096
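// Convert 16 RGBA pixels per loop to 16 Y values (BT.601).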
1097 void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1098 int x;
1099 int len = width / 16;
1100 __m128i src0, src1, src2, src3, dst0;
1101 __m128i tmp0, tmp1, tmp2, tmp3;
1102 __m128i reg0, reg1;
1103 __m128i const_129 = __lsx_vldi(0x81);
1104 __m128i const_br = {0x4219421942194219, 0x4219421942194219};
1105 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
1106
1107 for (x = 0; x < len; x++) {
1108 DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
1109 src0, src1, src2, src3);
1110 tmp0 = __lsx_vpickod_b(src1, src0);
1111 tmp1 = __lsx_vpickev_b(src1, src0);
1112 tmp2 = __lsx_vpickod_b(src3, src2);
1113 tmp3 = __lsx_vpickev_b(src3, src2);
1114 reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129);
1115 reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129);
1116 reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0);
1117 reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2);
1118 dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8);
1119 __lsx_vst(dst0, dst_y, 0);
1120 dst_y += 16;
1121 src_rgba += 64;
1122 }
1123 }
1124
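// Convert two rows of 16 RGBA pixels to 8 U and 8 V values (2x2 subsampled).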
1125 void RGBAToUVRow_LSX(const uint8_t* src_rgba,
1126 int src_stride_rgba,
1127 uint8_t* dst_u,
1128 uint8_t* dst_v,
1129 int width) {
1130 int x;
1131 const uint8_t* next_rgba = src_rgba + src_stride_rgba;
1132 int len = width / 16;
1133 __m128i src0, src1, src2, src3;
1134 __m128i nex0, nex1, nex2, nex3;
1135 __m128i tmp0, tmp1, tmp2, tmp3, dst0;
1136 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1137 __m128i const_112 = __lsx_vldi(0x438);
1138 __m128i const_74 = __lsx_vldi(0x425);
1139 __m128i const_38 = __lsx_vldi(0x413);
1140 __m128i const_94 = __lsx_vldi(0x42F);
1141 __m128i const_18 = __lsx_vldi(0x409);
1142 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
1143
1144 for (x = 0; x < len; x++) {
1145 DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
1146 src0, src1, src2, src3);
1147 DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
1148 48, nex0, nex1, nex2, nex3);
1149 tmp0 = __lsx_vpickod_b(src1, src0);
1150 tmp1 = __lsx_vpickev_b(src1, src0);
1151 tmp2 = __lsx_vpickod_b(src3, src2);
1152 tmp3 = __lsx_vpickev_b(src3, src2);
1153 tmpr = __lsx_vpickod_b(tmp2, tmp0);
1154 tmpb = __lsx_vpickev_b(tmp2, tmp0);
1155 tmpg = __lsx_vpickod_b(tmp3, tmp1);
1156 tmp0 = __lsx_vpickod_b(nex1, nex0);
1157 tmp1 = __lsx_vpickev_b(nex1, nex0);
1158 tmp2 = __lsx_vpickod_b(nex3, nex2);
1159 tmp3 = __lsx_vpickev_b(nex3, nex2);
1160 nexr = __lsx_vpickod_b(tmp2, tmp0);
1161 nexb = __lsx_vpickev_b(tmp2, tmp0);
1162 nexg = __lsx_vpickod_b(tmp3, tmp1);
1163 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1164 __lsx_vstelm_d(dst0, dst_u, 0, 0);
1165 __lsx_vstelm_d(dst0, dst_v, 0, 1);
1166 dst_u += 8;
1167 dst_v += 8;
1168 src_rgba += 64;
1169 next_rgba += 64;
1170 }
1171 }
1172
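// Convert two rows of 16 ARGB pixels to 8 U and 8 V values using
// full-range (JPEG) coefficients.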
1173 void ARGBToUVJRow_LSX(const uint8_t* src_argb,
1174 int src_stride_argb,
1175 uint8_t* dst_u,
1176 uint8_t* dst_v,
1177 int width) {
1178 int x;
1179 const uint8_t* next_argb = src_argb + src_stride_argb;
1180 int len = width / 16;
1181 __m128i src0, src1, src2, src3;
1182 __m128i nex0, nex1, nex2, nex3;
1183 __m128i tmp0, tmp1, tmp2, tmp3;
1184 __m128i reg0, reg1, dst0;
1185 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1186 __m128i const_63 = __lsx_vldi(0x43F);
1187 __m128i const_42 = __lsx_vldi(0x42A);
1188 __m128i const_21 = __lsx_vldi(0x415);
1189 __m128i const_53 = __lsx_vldi(0x435);
1190 __m128i const_10 = __lsx_vldi(0x40A);
1191 __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
1192
1193 for (x = 0; x < len; x++) {
1194 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
1195 src0, src1, src2, src3);
1196 DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
1197 48, nex0, nex1, nex2, nex3);
1198 tmp0 = __lsx_vpickev_b(src1, src0);
1199 tmp1 = __lsx_vpickod_b(src1, src0);
1200 tmp2 = __lsx_vpickev_b(src3, src2);
1201 tmp3 = __lsx_vpickod_b(src3, src2);
1202 tmpr = __lsx_vpickod_b(tmp2, tmp0);
1203 tmpb = __lsx_vpickev_b(tmp2, tmp0);
1204 tmpg = __lsx_vpickev_b(tmp3, tmp1);
1205 tmp0 = __lsx_vpickev_b(nex1, nex0);
1206 tmp1 = __lsx_vpickod_b(nex1, nex0);
1207 tmp2 = __lsx_vpickev_b(nex3, nex2);
1208 tmp3 = __lsx_vpickod_b(nex3, nex2);
1209 nexr = __lsx_vpickod_b(tmp2, tmp0);
1210 nexb = __lsx_vpickev_b(tmp2, tmp0);
1211 nexg = __lsx_vpickev_b(tmp3, tmp1);
1212 tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
1213 tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
1214 tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
1215 tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
1216 reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
1217 reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
1218 tmpb = __lsx_vavgr_hu(tmp0, tmp1);
1219 tmpg = __lsx_vavgr_hu(tmp2, tmp3);
1220 tmpr = __lsx_vavgr_hu(reg0, reg1);
1221 reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
1222 reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
1223 reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
1224 reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
1225 reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
1226 reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
1227 dst0 = __lsx_vpickod_b(reg1, reg0);
1228 __lsx_vstelm_d(dst0, dst_u, 0, 0);
1229 __lsx_vstelm_d(dst0, dst_v, 0, 1);
1230 dst_u += 8;
1231 dst_v += 8;
1232 src_argb += 64;
1233 next_argb += 64;
1234 }
1235 }
1236
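// Convert 16 pixels of I444 per loop to ARGB.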
1237 void I444ToARGBRow_LSX(const uint8_t* src_y,
1238 const uint8_t* src_u,
1239 const uint8_t* src_v,
1240 uint8_t* dst_argb,
1241 const struct YuvConstants* yuvconstants,
1242 int width) {
1243 int x;
1244 int len = width / 16;
1245 __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
1246 __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
1247 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
1248 __m128i const_80 = __lsx_vldi(0x480);
1249 __m128i alpha = __lsx_vldi(0xFF);
1250 __m128i zero = __lsx_vldi(0);
1251
1252 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1253 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
1254
1255 for (x = 0; x < len; x++) {
1256 vec_y = __lsx_vld(src_y, 0);
1257 vec_u = __lsx_vld(src_u, 0);
1258 vec_v = __lsx_vld(src_v, 0);
1259 vec_yl = __lsx_vilvl_b(vec_y, vec_y);
1260 vec_ul = __lsx_vilvl_b(zero, vec_u);
1261 vec_vl = __lsx_vilvl_b(zero, vec_v);
1262 I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
1263 out_b, out_g, out_r);
1264 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1265 vec_yh = __lsx_vilvh_b(vec_y, vec_y);
1266 vec_uh = __lsx_vilvh_b(zero, vec_u);
1267 vec_vh = __lsx_vilvh_b(zero, vec_v);
1268 I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
1269 out_b, out_g, out_r);
1270 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1271 src_y += 16;
1272 src_u += 16;
1273 src_v += 16;
1274 }
1275 }
1276
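// Convert 16 Y values per loop to grey ARGB using the Y coefficient and
// bias from yuvconstants.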
1277 void I400ToARGBRow_LSX(const uint8_t* src_y,
1278 uint8_t* dst_argb,
1279 const struct YuvConstants* yuvconstants,
1280 int width) {
1281 int x;
1282 int len = width / 16;
1283 __m128i vec_y, vec_yl, vec_yh, out0;
1284 __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
1285 __m128i temp0, temp1;
1286 __m128i alpha = __lsx_vldi(0xFF);
1287 __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
1288 __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);
1289
1290 for (x = 0; x < len; x++) {
1291 vec_y = __lsx_vld(src_y, 0);
1292 vec_yl = __lsx_vilvl_b(vec_y, vec_y);
1293 y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
1294 y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
1295 y_ev = __lsx_vsrai_w(y_ev, 16);
1296 y_od = __lsx_vsrai_w(y_od, 16);
1297 y_ev = __lsx_vadd_w(y_ev, vec_yb);
1298 y_od = __lsx_vadd_w(y_od, vec_yb);
1299 y_ev = __lsx_vsrai_w(y_ev, 6);
1300 y_od = __lsx_vsrai_w(y_od, 6);
1301 y_ev = __lsx_vclip255_w(y_ev);
1302 y_od = __lsx_vclip255_w(y_od);
1303 out0 = __lsx_vpackev_h(y_od, y_ev);
1304 temp0 = __lsx_vpackev_b(out0, out0);
1305 temp1 = __lsx_vpackev_b(alpha, out0);
1306 dst0 = __lsx_vilvl_h(temp1, temp0);
1307 dst1 = __lsx_vilvh_h(temp1, temp0);
1308 vec_yh = __lsx_vilvh_b(vec_y, vec_y);
1309 y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
1310 y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
1311 y_ev = __lsx_vsrai_w(y_ev, 16);
1312 y_od = __lsx_vsrai_w(y_od, 16);
1313 y_ev = __lsx_vadd_w(y_ev, vec_yb);
1314 y_od = __lsx_vadd_w(y_od, vec_yb);
1315 y_ev = __lsx_vsrai_w(y_ev, 6);
1316 y_od = __lsx_vsrai_w(y_od, 6);
1317 y_ev = __lsx_vclip255_w(y_ev);
1318 y_od = __lsx_vclip255_w(y_od);
1319 out0 = __lsx_vpackev_h(y_od, y_ev);
1320 temp0 = __lsx_vpackev_b(out0, out0);
1321 temp1 = __lsx_vpackev_b(alpha, out0);
1322 dst2 = __lsx_vilvl_h(temp1, temp0);
1323 dst3 = __lsx_vilvh_h(temp1, temp0);
1324 __lsx_vst(dst0, dst_argb, 0);
1325 __lsx_vst(dst1, dst_argb, 16);
1326 __lsx_vst(dst2, dst_argb, 32);
1327 __lsx_vst(dst3, dst_argb, 48);
1328 dst_argb += 64;
1329 src_y += 16;
1330 }
1331 }
1332
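// Copy 16 Y values per loop to B, G and R with alpha set to 0xFF.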
1333 void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
1334 int x;
1335 int len = width / 16;
1336 __m128i vec_y, dst0, dst1, dst2, dst3;
1337 __m128i tmp0, tmp1, tmp2, tmp3;
1338 __m128i alpha = __lsx_vldi(0xFF);
1339
1340 for (x = 0; x < len; x++) {
1341 vec_y = __lsx_vld(src_y, 0);
1342 tmp0 = __lsx_vilvl_b(vec_y, vec_y);
1343 tmp1 = __lsx_vilvh_b(vec_y, vec_y);
1344 tmp2 = __lsx_vilvl_b(alpha, vec_y);
1345 tmp3 = __lsx_vilvh_b(alpha, vec_y);
1346 dst0 = __lsx_vilvl_h(tmp2, tmp0);
1347 dst1 = __lsx_vilvh_h(tmp2, tmp0);
1348 dst2 = __lsx_vilvl_h(tmp3, tmp1);
1349 dst3 = __lsx_vilvh_h(tmp3, tmp1);
1350 __lsx_vst(dst0, dst_argb, 0);
1351 __lsx_vst(dst1, dst_argb, 16);
1352 __lsx_vst(dst2, dst_argb, 32);
1353 __lsx_vst(dst3, dst_argb, 48);
1354 dst_argb += 64;
1355 src_y += 16;
1356 }
1357 }
1358
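// Convert 8 YUY2 pixels (Y0 U Y1 V) per loop to ARGB.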
1359 void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
1360 uint8_t* dst_argb,
1361 const struct YuvConstants* yuvconstants,
1362 int width) {
1363 int x;
1364 int len = width / 8;
1365 __m128i src0, vec_y, vec_vu;
1366 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
1367 __m128i vec_vrub, vec_vgug;
1368 __m128i out_b, out_g, out_r;
1369 __m128i const_80 = __lsx_vldi(0x480);
1370 __m128i zero = __lsx_vldi(0);
1371 __m128i alpha = __lsx_vldi(0xFF);
1372
1373 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1374 vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
1375 vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
1376
1377 for (x = 0; x < len; x++) {
1378 src0 = __lsx_vld(src_yuy2, 0);
1379 vec_y = __lsx_vpickev_b(src0, src0);
1380 vec_vu = __lsx_vpickod_b(src0, src0);
1381 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
1382 out_r);
1383 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1384 src_yuy2 += 16;
1385 }
1386 }
1387
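// Convert 8 UYVY pixels (U Y0 V Y1) per loop to ARGB.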
1388 void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
1389 uint8_t* dst_argb,
1390 const struct YuvConstants* yuvconstants,
1391 int width) {
1392 int x;
1393 int len = width / 8;
1394 __m128i src0, vec_y, vec_vu;
1395 __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
1396 __m128i vec_vrub, vec_vgug;
1397 __m128i out_b, out_g, out_r;
1398 __m128i const_80 = __lsx_vldi(0x480);
1399 __m128i zero = __lsx_vldi(0);
1400 __m128i alpha = __lsx_vldi(0xFF);
1401
1402 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1403 vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
1404 vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
1405
1406 for (x = 0; x < len; x++) {
1407 src0 = __lsx_vld(src_uyvy, 0);
1408 vec_y = __lsx_vpickod_b(src0, src0);
1409 vec_vu = __lsx_vpickev_b(src0, src0);
1410 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
1411 out_r);
1412 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1413 src_uyvy += 16;
1414 }
1415 }
1416
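// Blend src_ptr and the row at src_ptr + src_stride by source_y_fraction
// (0 = copy src_ptr, 128 = average the two rows); 32 bytes per loop.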
1417 void InterpolateRow_LSX(uint8_t* dst_ptr,
1418 const uint8_t* src_ptr,
1419 ptrdiff_t src_stride,
1420 int width,
1421 int32_t source_y_fraction) {
1422 int x;
1423 int y1_fraction = source_y_fraction;
1424 int y0_fraction = 256 - y1_fraction;
1425 const uint8_t* nex_ptr = src_ptr + src_stride;
1426 uint16_t y_fractions;
1427 int len = width / 32;
1428 __m128i src0, src1, nex0, nex1;
1429 __m128i dst0, dst1, y_frac;
1430 __m128i tmp0, tmp1, tmp2, tmp3;
1431 __m128i const_128 = __lsx_vldi(0x480);
1432
1433 if (y1_fraction == 0) {
1434 for (x = 0; x < len; x++) {
1435 DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
1436 __lsx_vst(src0, dst_ptr, 0);
1437 __lsx_vst(src1, dst_ptr, 16);
1438 src_ptr += 32;
1439 dst_ptr += 32;
1440 }
1441 return;
1442 }
1443
1444 if (y1_fraction == 128) {
1445 for (x = 0; x < len; x++) {
1446 DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
1447 DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
1448 dst0 = __lsx_vavgr_bu(src0, nex0);
1449 dst1 = __lsx_vavgr_bu(src1, nex1);
1450 __lsx_vst(dst0, dst_ptr, 0);
1451 __lsx_vst(dst1, dst_ptr, 16);
1452 src_ptr += 32;
1453 nex_ptr += 32;
1454 dst_ptr += 32;
1455 }
1456 return;
1457 }
1458
1459 y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
1460 y_frac = __lsx_vreplgr2vr_h(y_fractions);
1461
1462 for (x = 0; x < len; x++) {
1463 DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
1464 DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
1465 tmp0 = __lsx_vilvl_b(nex0, src0);
1466 tmp1 = __lsx_vilvh_b(nex0, src0);
1467 tmp2 = __lsx_vilvl_b(nex1, src1);
1468 tmp3 = __lsx_vilvh_b(nex1, src1);
1469 tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
1470 tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
1471 tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
1472 tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
1473 dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
1474 dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
1475 __lsx_vst(dst0, dst_ptr, 0);
1476 __lsx_vst(dst1, dst_ptr, 16);
1477 src_ptr += 32;
1478 nex_ptr += 32;
1479 dst_ptr += 32;
1480 }
1481 }
1482
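// Fill a row with the ARGB value v32; 4 pixels per loop.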
1483 void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
1484 int x;
1485 int len = width / 4;
1486 __m128i dst0 = __lsx_vreplgr2vr_w(v32);
1487
1488 for (x = 0; x < len; x++) {
1489 __lsx_vst(dst0, dst_argb, 0);
1490 dst_argb += 16;
1491 }
1492 }
1493
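// Convert 16 RAW pixels per loop to RGB24 by swapping R and B.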
1494 void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
1495 int x;
1496 int len = width / 16;
1497 __m128i src0, src1, src2;
1498 __m128i dst0, dst1, dst2;
1499 __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
1500 __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
1501 __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};
1502
1503 for (x = 0; x < len; x++) {
1504 DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
1505 src2 = __lsx_vld(src_raw, 32);
1506 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
1507 dst2 = __lsx_vshuf_b(src1, src2, shuf2);
1508 dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
1509 __lsx_vst(dst0, dst_rgb24, 0);
1510 __lsx_vst(dst1, dst_rgb24, 16);
1511 __lsx_vst(dst2, dst_rgb24, 32);
1512 dst_rgb24 += 48;
1513 src_raw += 48;
1514 }
1515 }
1516
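// Interleave planar U and V rows into a single UV row, 16 pairs per loop.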
1517 void MergeUVRow_LSX(const uint8_t* src_u,
1518 const uint8_t* src_v,
1519 uint8_t* dst_uv,
1520 int width) {
1521 int x;
1522 int len = width / 16;
1523 __m128i src0, src1, dst0, dst1;
1524
1525 for (x = 0; x < len; x++) {
1526 DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1);
1527 dst0 = __lsx_vilvl_b(src1, src0);
1528 dst1 = __lsx_vilvh_b(src1, src0);
1529 __lsx_vst(dst0, dst_uv, 0);
1530 __lsx_vst(dst1, dst_uv, 16);
1531 src_u += 16;
1532 src_v += 16;
1533 dst_uv += 32;
1534 }
1535 }
1536
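// Copy the alpha byte (byte 3) of each ARGB pixel into a planar alpha row;
// 16 pixels per iteration.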
1537 void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
1538 uint8_t* dst_a,
1539 int width) {
1540 int x;
1541 int len = width / 16;
1542 __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
1543
1544 for (x = 0; x < len; x++) {
1545 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
1546 src0, src1, src2, src3);
1547 tmp0 = __lsx_vpickod_b(src1, src0);
1548 tmp1 = __lsx_vpickod_b(src3, src2);
1549 dst0 = __lsx_vpickod_b(tmp1, tmp0);
1550 __lsx_vst(dst0, dst_a, 0);
1551 src_argb += 64;
1552 dst_a += 16;
1553 }
1554 }
1555
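// Alpha blend two ARGB rows. Per channel this computes
// dst = min(255, src0 + ((src1 * (256 - a0)) >> 8)), where a0 is the src0
// alpha, then forces the destination alpha byte to 255; 8 pixels per loop.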
1556 void ARGBBlendRow_LSX(const uint8_t* src_argb,
1557 const uint8_t* src_argb1,
1558 uint8_t* dst_argb,
1559 int width) {
1560 int x;
1561 int len = width / 8;
1562 __m128i src0, src1, src2, src3;
1563 __m128i tmp0, tmp1, dst0, dst1;
1564 __m128i reg0, reg1, reg2, reg3;
1565 __m128i a0, a1, a2, a3;
1566 __m128i const_256 = __lsx_vldi(0x500);
1567 __m128i zero = __lsx_vldi(0);
1568 __m128i alpha = __lsx_vldi(0xFF);
1569 __m128i control = {0xFF000000FF000000, 0xFF000000FF000000};
1570
1571 for (x = 0; x < len; x++) {
1572 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
1573 src0, src1, src2, src3);
1574 tmp0 = __lsx_vshuf4i_b(src0, 0xFF);
1575 tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
1576 a0 = __lsx_vilvl_b(zero, tmp0);
1577 a1 = __lsx_vilvh_b(zero, tmp0);
1578 a2 = __lsx_vilvl_b(zero, tmp1);
1579 a3 = __lsx_vilvh_b(zero, tmp1);
1580 reg0 = __lsx_vilvl_b(zero, src2);
1581 reg1 = __lsx_vilvh_b(zero, src2);
1582 reg2 = __lsx_vilvl_b(zero, src3);
1583 reg3 = __lsx_vilvh_b(zero, src3);
1584 DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
1585 const_256, a3, a0, a1, a2, a3);
1586 DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
1587 reg2, reg3);
1588 DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);
1589 dst0 = __lsx_vsadd_bu(dst0, src0);
1590 dst1 = __lsx_vsadd_bu(dst1, src1);
1591 dst0 = __lsx_vbitsel_v(dst0, alpha, control);
1592 dst1 = __lsx_vbitsel_v(dst1, alpha, control);
1593 __lsx_vst(dst0, dst_argb, 0);
1594 __lsx_vst(dst1, dst_argb, 16);
1595 src_argb += 32;
1596 src_argb1 += 32;
1597 dst_argb += 32;
1598 }
1599 }
1600
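// Quantize ARGB color channels in place:
// c = ((c * scale) >> 16) * interval_size + interval_offset.
// The alpha byte is preserved via the 0xFF000000 select mask; 16 pixels/loop.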
1601 void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
1602 int scale,
1603 int interval_size,
1604 int interval_offset,
1605 int width) {
1606 int x;
1607 int len = width / 16;
1608 __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1609 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1610 __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
1611 __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);
1612 __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);
1613 __m128i vec_scale = __lsx_vreplgr2vr_w(scale);
1614 __m128i zero = __lsx_vldi(0);
1615 __m128i control = {0xFF000000FF000000, 0xFF000000FF000000};
1616
1617 for (x = 0; x < len; x++) {
1618 DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
1619 src0, src1, src2, src3);
1620 reg0 = __lsx_vilvl_b(zero, src0);
1621 reg1 = __lsx_vilvh_b(zero, src0);
1622 reg2 = __lsx_vilvl_b(zero, src1);
1623 reg3 = __lsx_vilvh_b(zero, src1);
1624 reg4 = __lsx_vilvl_b(zero, src2);
1625 reg5 = __lsx_vilvh_b(zero, src2);
1626 reg6 = __lsx_vilvl_b(zero, src3);
1627 reg7 = __lsx_vilvh_b(zero, src3);
1628 tmp0 = __lsx_vilvl_h(zero, reg0);
1629 tmp1 = __lsx_vilvh_h(zero, reg0);
1630 tmp2 = __lsx_vilvl_h(zero, reg1);
1631 tmp3 = __lsx_vilvh_h(zero, reg1);
1632 tmp4 = __lsx_vilvl_h(zero, reg2);
1633 tmp5 = __lsx_vilvh_h(zero, reg2);
1634 tmp6 = __lsx_vilvl_h(zero, reg3);
1635 tmp7 = __lsx_vilvh_h(zero, reg3);
1636 DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
1637 tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
1638 DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
1639 tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
1640 DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
1641 tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
1642 dst0 = __lsx_vpickev_b(reg1, reg0);
1643 dst1 = __lsx_vpickev_b(reg3, reg2);
1644 tmp0 = __lsx_vilvl_h(zero, reg4);
1645 tmp1 = __lsx_vilvh_h(zero, reg4);
1646 tmp2 = __lsx_vilvl_h(zero, reg5);
1647 tmp3 = __lsx_vilvh_h(zero, reg5);
1648 tmp4 = __lsx_vilvl_h(zero, reg6);
1649 tmp5 = __lsx_vilvh_h(zero, reg6);
1650 tmp6 = __lsx_vilvl_h(zero, reg7);
1651 tmp7 = __lsx_vilvh_h(zero, reg7);
1652 DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
1653 tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
1654 DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
1655 tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
1656 DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
1657 tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
1658 dst2 = __lsx_vpickev_b(reg1, reg0);
1659 dst3 = __lsx_vpickev_b(reg3, reg2);
1660 DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
1661 dst3, vec_size, dst0, dst1, dst2, dst3);
1662 DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
1663 vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);
1664 DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
1665 src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);
1666 __lsx_vst(dst0, dst_argb, 0);
1667 __lsx_vst(dst1, dst_argb, 16);
1668 __lsx_vst(dst2, dst_argb, 32);
1669 __lsx_vst(dst3, dst_argb, 48);
1670 dst_argb += 64;
1671 }
1672 }
1673
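// Apply a 4x4 signed color matrix (four rows of 4 int8 coefficients) to each
// ARGB pixel: out = clamp(dot(pixel, row) >> 6, 0, 255); 8 pixels per loop.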
1674 void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
1675 uint8_t* dst_argb,
1676 const int8_t* matrix_argb,
1677 int width) {
1678 int x;
1679 int len = width / 8;
1680 __m128i src0, src1, tmp0, tmp1, dst0, dst1;
1681 __m128i tmp_b, tmp_g, tmp_r, tmp_a;
1682 __m128i reg_b, reg_g, reg_r, reg_a;
1683 __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);
1684 __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);
1685 __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);
1686 __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);
1687
1688 for (x = 0; x < len; x++) {
1689 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1690 DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
1691 src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);
1692 DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
1693 src1, matrix_a, reg_b, reg_g, reg_r, reg_a);
1694 DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
1695 tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
1696 DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
1697 reg_a, reg_b, reg_g, reg_r, reg_a);
1698 DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
1699 tmp_g, tmp_r, tmp_a);
1700 DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
1701 reg_g, reg_r, reg_a);
1702 DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
1703 tmp_a)
1704 DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
1705 reg_a)
1706 DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
1707 tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
1708 tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);
1709 tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);
1710 dst0 = __lsx_vilvl_h(tmp1, tmp0);
1711 dst1 = __lsx_vilvh_h(tmp1, tmp0);
1712 __lsx_vst(dst0, dst_argb, 0);
1713 __lsx_vst(dst1, dst_argb, 16);
1714 src_argb += 32;
1715 dst_argb += 32;
1716 }
1717 }
1718
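// De-interleave a UV row into separate U and V planes, 32 pairs per loop.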
1719 void SplitUVRow_LSX(const uint8_t* src_uv,
1720 uint8_t* dst_u,
1721 uint8_t* dst_v,
1722 int width) {
1723 int x;
1724 int len = width / 32;
1725 __m128i src0, src1, src2, src3;
1726 __m128i dst0, dst1, dst2, dst3;
1727
1728 for (x = 0; x < len; x++) {
1729 DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
1730 src1, src2, src3);
1731 DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);
1732 DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);
1733 __lsx_vst(dst0, dst_u, 0);
1734 __lsx_vst(dst1, dst_u, 16);
1735 __lsx_vst(dst2, dst_v, 0);
1736 __lsx_vst(dst3, dst_v, 16);
1737 src_uv += 64;
1738 dst_u += 32;
1739 dst_v += 32;
1740 }
1741 }
1742
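// Fill the row with the byte value v8, 16 bytes per store.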
1743 void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {
1744 int x;
1745 int len = width / 16;
1746 __m128i dst0 = __lsx_vreplgr2vr_b(v8);
1747
1748 for (x = 0; x < len; x++) {
1749 __lsx_vst(dst0, dst, 0);
1750 dst += 16;
1751 }
1752 }
1753
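// Split an interleaved UV row into U and V planes while mirroring them
// horizontally (the row is read right to left); 32 pairs per iteration.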
1754 void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
1755 uint8_t* dst_u,
1756 uint8_t* dst_v,
1757 int width) {
1758 int x;
1759 int len = width / 32;
1760 __m128i src0, src1, src2, src3;
1761 __m128i dst0, dst1, dst2, dst3;
1762 __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};
1763 __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};
1764
1765 src_uv += (width << 1);
1766 for (x = 0; x < len; x++) {
1767 src_uv -= 64;
1768 DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
1769 src3, src0, src1);
1770 DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
1771 shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);
1772 __lsx_vst(dst0, dst_v, 0);
1773 __lsx_vst(dst1, dst_v, 16);
1774 __lsx_vst(dst2, dst_u, 0);
1775 __lsx_vst(dst3, dst_u, 16);
1776 dst_u += 32;
1777 dst_v += 32;
1778 }
1779 }
1780
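// Convert 16-bit samples to half floats. Scaling by 1.9259299444e-34f
// (about 2^-112) folds the half-float exponent bias into the float multiply,
// so bits [28:13] of each product are the half-float result; 32 values/loop.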
1781 void HalfFloatRow_LSX(const uint16_t* src,
1782 uint16_t* dst,
1783 float scale,
1784 int width) {
1785 int x;
1786 int len = width / 32;
1787 float mult = 1.9259299444e-34f * scale;
1788 __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1789 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1790 __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
1791 __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);
1792 __m128i zero = __lsx_vldi(0);
1793
1794 for (x = 0; x < len; x++) {
1795 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
1796 src3);
1797 DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
1798 tmp0, tmp2, tmp4, tmp6);
1799 DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
1800 tmp1, tmp3, tmp5, tmp7);
1801 DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
1802 reg6);
1803 DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
1804 reg7);
1805 DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
1806 reg3, vec_mult, reg0, reg1, reg2, reg3);
1807 DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
1808 reg7, vec_mult, reg4, reg5, reg6, reg7);
1809 DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
1810 (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);
1811 DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
1812 (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
1813 DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1814 dst0, dst1, dst2, dst3);
1815 __lsx_vst(dst0, dst, 0);
1816 __lsx_vst(dst1, dst, 16);
1817 __lsx_vst(dst2, dst, 32);
1818 __lsx_vst(dst3, dst, 48);
1819 src += 32;
1820 dst += 32;
1821 }
1822 }
1823
1824 #ifdef __cplusplus
1825 } // extern "C"
1826 } // namespace libyuv
1827 #endif
1828
1829 #endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
1830