1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <string.h>
12
13 #include "libyuv/row.h"
14
15 // This module is for GCC MSA
16 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
17 #include "libyuv/macros_msa.h"
18
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23
24 #define ALPHA_VAL (-1)
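// ALPHA_VAL (-1) is splatted with __msa_ldi_b so that every byte lane holds 0xFF
// (fully opaque alpha).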
25
26 // Fill YUV -> RGB conversion constants into vectors
27 #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
28 { \
29 ub = __msa_fill_w(yuvconst->kUVToB[0]); \
30 vr = __msa_fill_w(yuvconst->kUVToR[1]); \
31 ug = __msa_fill_w(yuvconst->kUVToG[0]); \
32 vg = __msa_fill_w(yuvconst->kUVToG[1]); \
33 yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
34 yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
35 }
36
37 // Load YUV 422 pixel data
38 #define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
39 { \
40 uint64_t y_m; \
41 uint32_t u_m, v_m; \
42 v4i32 zero_m = {0}; \
43 y_m = LD(psrc_y); \
44 u_m = LW(psrc_u); \
45 v_m = LW(psrc_v); \
46 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
47 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \
48 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \
49 }
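// READYUV422 loads 8 Y bytes plus 4 U and 4 V bytes into the low lanes of zeroed vectors.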
50
51 // Clip input vector elements between 0 and 255
52 #define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
53 { \
54 v4i32 max_m = __msa_ldi_w(0xFF); \
55 \
56 in0 = __msa_maxi_s_w(in0, 0); \
57 in1 = __msa_maxi_s_w(in1, 0); \
58 in2 = __msa_maxi_s_w(in2, 0); \
59 in3 = __msa_maxi_s_w(in3, 0); \
60 in4 = __msa_maxi_s_w(in4, 0); \
61 in5 = __msa_maxi_s_w(in5, 0); \
62 in0 = __msa_min_s_w(max_m, in0); \
63 in1 = __msa_min_s_w(max_m, in1); \
64 in2 = __msa_min_s_w(max_m, in2); \
65 in3 = __msa_min_s_w(max_m, in3); \
66 in4 = __msa_min_s_w(max_m, in4); \
67 in5 = __msa_min_s_w(max_m, in5); \
68 }
69
70 // Convert 8 pixels of YUV 420 to RGB.
71 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
72 { \
73 v8i16 vec0_m, vec1_m; \
74 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
75 v4i32 reg5_m, reg6_m, reg7_m; \
76 v16i8 temp_m, zero_m = {0}; \
77 \
78 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
79 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
80 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
81 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
82 vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); \
83 temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); \
84 reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); \
85 reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \
86 reg0_m *= yg; \
87 reg1_m *= yg; \
88 reg2_m *= ubvr; \
89 reg3_m *= ubvr; \
90 reg0_m = __msa_srai_w(reg0_m, 16); \
91 reg1_m = __msa_srai_w(reg1_m, 16); \
92 reg0_m += yb; \
93 reg1_m += yb; \
94 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
95 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
96 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
97 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
98 reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
99 reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
100 reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
101 reg5_m = reg0_m + reg5_m; \
102 reg6_m = reg1_m + reg6_m; \
103 reg2_m = reg0_m + reg2_m; \
104 reg3_m = reg1_m + reg3_m; \
105 reg7_m = reg0_m - reg7_m; \
106 reg4_m = reg1_m - reg4_m; \
107 reg5_m = __msa_srai_w(reg5_m, 6); \
108 reg6_m = __msa_srai_w(reg6_m, 6); \
109 reg7_m = __msa_srai_w(reg7_m, 6); \
110 reg4_m = __msa_srai_w(reg4_m, 6); \
111 reg2_m = __msa_srai_w(reg2_m, 6); \
112 reg3_m = __msa_srai_w(reg3_m, 6); \
113 CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
114 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
115 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
116 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
117 }
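// The math above is fixed point: Y is widened to Y * 0x0101, scaled by yg with a >> 16 and
// biased by yb, while the (U, V) - 128 terms are scaled by the packed ub/vr and ug/vg
// coefficients; the combined sums are shifted right by 6 and clamped to [0, 255] before
// being packed into the B, G and R outputs.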
118
119 // Pack and Store 8 ARGB values.
120 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \
121 { \
122 v8i16 vec0_m, vec1_m; \
123 v16u8 dst0_m, dst1_m; \
124 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
125 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
126 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
127 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
128 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
129 }
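// STOREARGB interleaves the byte values of in0..in3 into in0, in1, in2, in3 order per pixel
// (B, G, R, A for the ARGB writers) and stores 32 bytes, i.e. 8 pixels.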
130
131 // Takes ARGB input and calculates Y.
132 #define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
133 y_out) \
134 { \
135 v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
136 v8u16 reg0_m, reg1_m; \
137 \
138 vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
139 vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
140 vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
141 vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
142 reg0_m = __msa_dotp_u_h(vec0_m, const0); \
143 reg1_m = __msa_dotp_u_h(vec1_m, const0); \
144 reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
145 reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
146 reg0_m += const2; \
147 reg1_m += const2; \
148 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
149 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
150 y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
151 }
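// ARGBTOY forms a weighted sum of the four channels via two dot products with const0 and
// const1, adds the rounding constant const2 and shifts right by 'shift' to produce 16 Y bytes.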
152
153 // Loads the current and next row of ARGB input (via the caller's 's' and 't' pointers, not the s_ptr/t_ptr parameters) and averages them to calculate U and V
154 #define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
155 { \
156 v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
157 v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
158 v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
159 v8u16 reg8_m, reg9_m; \
160 \
161 src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
162 src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
163 src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
164 src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
165 src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
166 src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
167 src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
168 src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
169 vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
170 vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
171 vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
172 vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
173 vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
174 vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
175 vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
176 vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
177 reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
178 reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
179 reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
180 reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
181 reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
182 reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
183 reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
184 reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
185 reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
186 reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
187 reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
188 reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
189 reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
190 reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
191 reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
192 reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
193 reg8_m += const_0x0101; \
194 reg9_m += const_0x0101; \
195 reg0_m += const_0x0101; \
196 reg1_m += const_0x0101; \
197 argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
198 argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
199 argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
200 argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
201 }
202
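// Computes U and V from four vectors of 16-bit ARGB channel data using the supplied
// coefficients, bias and shift.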
203 #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
204 shf0, shf1, shf2, shf3, shift, u_out, v_out) \
205 { \
206 v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
207 v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
208 \
209 vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
210 vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
211 vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
212 vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
213 vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
214 vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
215 vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
216 vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
217 reg0_m = __msa_dotp_u_w(vec0_m, const0); \
218 reg1_m = __msa_dotp_u_w(vec1_m, const0); \
219 reg2_m = __msa_dotp_u_w(vec4_m, const0); \
220 reg3_m = __msa_dotp_u_w(vec5_m, const0); \
221 reg0_m += const1; \
222 reg1_m += const1; \
223 reg2_m += const1; \
224 reg3_m += const1; \
225 reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
226 reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
227 reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
228 reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
229 reg0_m = __msa_srl_w(reg0_m, shift); \
230 reg1_m = __msa_srl_w(reg1_m, shift); \
231 reg2_m = __msa_srl_w(reg2_m, shift); \
232 reg3_m = __msa_srl_w(reg3_m, shift); \
233 u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
234 v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
235 }
236
237 // Takes ARGB input and calculates U and V.
238 #define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
239 shf0, shf1, shf2, shf3, v_out, u_out) \
240 { \
241 v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
242 v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
243 \
244 vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
245 vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
246 vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
247 vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
248 vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
249 vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
250 vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
251 vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
252 reg0_m = __msa_dotp_u_w(vec0_m, const1); \
253 reg1_m = __msa_dotp_u_w(vec1_m, const1); \
254 reg2_m = __msa_dotp_u_w(vec4_m, const1); \
255 reg3_m = __msa_dotp_u_w(vec5_m, const1); \
256 reg0_m += (v4u32)const3; \
257 reg1_m += (v4u32)const3; \
258 reg2_m += (v4u32)const3; \
259 reg3_m += (v4u32)const3; \
260 reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
261 reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
262 reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
263 reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
264 u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
265 v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
266 u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
267 v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
268 }
269
270 // Load I444 pixel data
271 #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
272 { \
273 uint64_t y_m, u_m, v_m; \
274 v2i64 zero_m = {0}; \
275 y_m = LD(psrc_y); \
276 u_m = LD(psrc_u); \
277 v_m = LD(psrc_v); \
278 out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \
279 out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \
280 out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
281 }
282
283 #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
284 { \
285 v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
286 v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
287 _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
288 _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
289 _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
290 _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
291 _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
292 _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
293 _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
294 _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
295 _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
296 _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
297 _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
298 _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
299 _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
300 _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
301 _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
302 _reg1 = const_8080 + const_112 * _reg0; \
303 _reg3 = const_8080 + const_112 * _reg4; \
304 _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
305 _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
306 _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
307 _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
308 _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
309 }
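// RGBTOUV averages 2x2 blocks from split B, G and R rows of the current and next line and
// computes U and V packed into a single output vector; it uses const_112, const_74, const_38,
// const_94, const_18 and const_8080, which must be defined at the expansion site.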
310
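// Mirrors a row of bytes, 64 bytes per iteration, reading backwards from the end of the row.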
311 void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
312 int x;
313 v16u8 src0, src1, src2, src3;
314 v16u8 dst0, dst1, dst2, dst3;
315 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
316 src += width - 64;
317
318 for (x = 0; x < width; x += 64) {
319 LD_UB4(src, 16, src3, src2, src1, src0);
320 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
321 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
322 ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
323 dst += 64;
324 src -= 64;
325 }
326 }
327
328 void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
329 int x;
330 v8u16 src, dst;
331 v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
332 src_uv += (width - 8) << 1;
333 for (x = 0; x < width; x += 8) {
334 src = LD_UH(src_uv);
335 dst = __msa_vshf_h(shuffler, src, src);
336 ST_UH(dst, dst_uv);
337 src_uv -= 16;
338 dst_uv += 16;
339 }
340 }
341
342 void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
343 int x;
344 v16u8 src0, src1, src2, src3;
345 v16u8 dst0, dst1, dst2, dst3;
346 v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
347 src += width * 4 - 64;
348
349 for (x = 0; x < width; x += 16) {
350 LD_UB4(src, 16, src3, src2, src1, src0);
351 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
352 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
353 ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
354 dst += 64;
355 src -= 64;
356 }
357 }
358
359 void I422ToYUY2Row_MSA(const uint8_t* src_y,
360 const uint8_t* src_u,
361 const uint8_t* src_v,
362 uint8_t* dst_yuy2,
363 int width) {
364 int x;
365 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
366 v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
367
368 for (x = 0; x < width; x += 32) {
369 src_u0 = LD_UB(src_u);
370 src_v0 = LD_UB(src_v);
371 LD_UB2(src_y, 16, src_y0, src_y1);
372 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
373 ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
374 ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
375 ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
376 src_u += 16;
377 src_v += 16;
378 src_y += 32;
379 dst_yuy2 += 64;
380 }
381 }
382
383 void I422ToUYVYRow_MSA(const uint8_t* src_y,
384 const uint8_t* src_u,
385 const uint8_t* src_v,
386 uint8_t* dst_uyvy,
387 int width) {
388 int x;
389 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
390 v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
391
392 for (x = 0; x < width; x += 32) {
393 src_u0 = LD_UB(src_u);
394 src_v0 = LD_UB(src_v);
395 LD_UB2(src_y, 16, src_y0, src_y1);
396 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
397 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
398 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
399 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
400 src_u += 16;
401 src_v += 16;
402 src_y += 32;
403 dst_uyvy += 64;
404 }
405 }
406
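// Converts 8 pixels of I422 (4:2:2 YUV) to ARGB per iteration with a constant 0xFF alpha.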
407 void I422ToARGBRow_MSA(const uint8_t* src_y,
408 const uint8_t* src_u,
409 const uint8_t* src_v,
410 uint8_t* dst_argb,
411 const struct YuvConstants* yuvconstants,
412 int width) {
413 int x;
414 v16u8 src0, src1, src2;
415 v8i16 vec0, vec1, vec2;
416 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
417 v4i32 vec_ubvr, vec_ugvg;
418 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
419 v8i16 const_0x80 = __msa_ldi_h(0x80);
420
421 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
422 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
423 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
424
425 for (x = 0; x < width; x += 8) {
426 READYUV422(src_y, src_u, src_v, src0, src1, src2);
427 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
428 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
429 STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
430 src_y += 8;
431 src_u += 4;
432 src_v += 4;
433 dst_argb += 32;
434 }
435 }
436
437 void I422ToRGBARow_MSA(const uint8_t* src_y,
438 const uint8_t* src_u,
439 const uint8_t* src_v,
440 uint8_t* dst_argb,
441 const struct YuvConstants* yuvconstants,
442 int width) {
443 int x;
444 v16u8 src0, src1, src2;
445 v8i16 vec0, vec1, vec2;
446 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
447 v4i32 vec_ubvr, vec_ugvg;
448 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
449 v8i16 const_0x80 = __msa_ldi_h(0x80);
450
451 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
452 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
453 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
454
455 for (x = 0; x < width; x += 8) {
456 READYUV422(src_y, src_u, src_v, src0, src1, src2);
457 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
458 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
459 STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
460 src_y += 8;
461 src_u += 4;
462 src_v += 4;
463 dst_argb += 32;
464 }
465 }
466
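// Same as I422ToARGBRow_MSA but takes the A channel from a separate alpha plane.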
467 void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
468 const uint8_t* src_u,
469 const uint8_t* src_v,
470 const uint8_t* src_a,
471 uint8_t* dst_argb,
472 const struct YuvConstants* yuvconstants,
473 int width) {
474 int x;
475 int64_t data_a;
476 v16u8 src0, src1, src2, src3;
477 v8i16 vec0, vec1, vec2;
478 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
479 v4i32 vec_ubvr, vec_ugvg;
480 v4i32 zero = {0};
481 v8i16 const_0x80 = __msa_ldi_h(0x80);
482
483 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
484 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
485 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
486
487 for (x = 0; x < width; x += 8) {
488 data_a = LD(src_a);
489 READYUV422(src_y, src_u, src_v, src0, src1, src2);
490 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
491 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
492 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
493 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
494 STOREARGB(vec0, vec1, vec2, src3, dst_argb);
495 src_y += 8;
496 src_u += 4;
497 src_v += 4;
498 src_a += 8;
499 dst_argb += 32;
500 }
501 }
502
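// Converts 16 pixels of I422 to packed 24-bit RGB (48 output bytes) per iteration.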
503 void I422ToRGB24Row_MSA(const uint8_t* src_y,
504 const uint8_t* src_u,
505 const uint8_t* src_v,
506 uint8_t* dst_argb,
507 const struct YuvConstants* yuvconstants,
508 int32_t width) {
509 int x;
510 int64_t data_u, data_v;
511 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
512 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
513 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
514 v4i32 vec_ubvr, vec_ugvg;
515 v16u8 reg0, reg1, reg2, reg3;
516 v2i64 zero = {0};
517 v8i16 const_0x80 = __msa_ldi_h(0x80);
518 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
519 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
520 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
521 11, 29, 12, 13, 30, 14, 15, 31};
522
523 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
524 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
525 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
526
527 for (x = 0; x < width; x += 16) {
528 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
529 data_u = LD(src_u);
530 data_v = LD(src_v);
531 src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
532 src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
533 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
534 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
535 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
536 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
537 YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5);
538 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
539 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
540 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
541 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
542 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
543 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
544 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
545 ST_UB2(dst0, dst1, dst_argb, 16);
546 ST_UB(dst2, (dst_argb + 32));
547 src_y += 16;
548 src_u += 8;
549 src_v += 8;
550 dst_argb += 48;
551 }
552 }
553
554 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
555 void I422ToRGB565Row_MSA(const uint8_t* src_y,
556 const uint8_t* src_u,
557 const uint8_t* src_v,
558 uint8_t* dst_rgb565,
559 const struct YuvConstants* yuvconstants,
560 int width) {
561 int x;
562 v16u8 src0, src1, src2, dst0;
563 v8i16 vec0, vec1, vec2;
564 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
565 v4i32 vec_ubvr, vec_ugvg;
566 v8i16 const_0x80 = __msa_ldi_h(0x80);
567
568 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
569 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
570 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
571
572 for (x = 0; x < width; x += 8) {
573 READYUV422(src_y, src_u, src_v, src0, src1, src2);
574 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
575 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
576 vec0 = __msa_srli_h(vec0, 3);
577 vec1 = __msa_srli_h(vec1, 2);
578 vec2 = __msa_srli_h(vec2, 3);
579 vec2 = __msa_slli_h(vec2, 11);
580 vec1 = __msa_slli_h(vec1, 5);
581 vec0 |= vec1;
582 dst0 = (v16u8)(vec2 | vec0);
583 ST_UB(dst0, dst_rgb565);
584 src_y += 8;
585 src_u += 4;
586 src_v += 4;
587 dst_rgb565 += 16;
588 }
589 }
590
591 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
592 void I422ToARGB4444Row_MSA(const uint8_t* src_y,
593 const uint8_t* src_u,
594 const uint8_t* src_v,
595 uint8_t* dst_argb4444,
596 const struct YuvConstants* yuvconstants,
597 int width) {
598 int x;
599 v16u8 src0, src1, src2, dst0;
600 v8i16 vec0, vec1, vec2;
601 v8u16 reg0, reg1, reg2;
602 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
603 v4i32 vec_ubvr, vec_ugvg;
604 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
605 v8u16 mask = (v8u16)__msa_fill_h(0x00F0);
606 v8i16 const_0x80 = __msa_ldi_h(0x80);
607
608 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
609 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
610 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
611
612 for (x = 0; x < width; x += 8) {
613 READYUV422(src_y, src_u, src_v, src0, src1, src2);
614 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
615 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
616 reg0 = (v8u16)__msa_srli_h(vec0, 4);
617 reg2 = (v8u16)__msa_srli_h(vec2, 4);
618 reg1 = (v8u16)__msa_and_v(vec1, mask);
619 reg2 = (v8u16)__msa_slli_h(reg2, 8);
620 reg1 |= const_0xF000;
621 reg0 |= reg2;
622 dst0 = (v16u8)(reg1 | reg0);
623 ST_UB(dst0, dst_argb4444);
624 src_y += 8;
625 src_u += 4;
626 src_v += 4;
627 dst_argb4444 += 16;
628 }
629 }
630
631 void I422ToARGB1555Row_MSA(const uint8_t* src_y,
632 const uint8_t* src_u,
633 const uint8_t* src_v,
634 uint8_t* dst_argb1555,
635 const struct YuvConstants* yuvconstants,
636 int width) {
637 int x;
638 v16u8 src0, src1, src2, dst0;
639 v8i16 vec0, vec1, vec2;
640 v8u16 reg0, reg1, reg2;
641 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
642 v4i32 vec_ubvr, vec_ugvg;
643 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
644 v8i16 const_0x80 = __msa_ldi_h(0x80);
645
646 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
647 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
648 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
649
650 for (x = 0; x < width; x += 8) {
651 READYUV422(src_y, src_u, src_v, src0, src1, src2);
652 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
653 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
654 reg0 = (v8u16)__msa_srli_h(vec0, 3);
655 reg1 = (v8u16)__msa_srli_h(vec1, 3);
656 reg2 = (v8u16)__msa_srli_h(vec2, 3);
657 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
658 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
659 reg1 |= const_0x8000;
660 reg0 |= reg2;
661 dst0 = (v16u8)(reg1 | reg0);
662 ST_UB(dst0, dst_argb1555);
663 src_y += 8;
664 src_u += 4;
665 src_v += 4;
666 dst_argb1555 += 16;
667 }
668 }
669
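// Copies the Y samples (even bytes) of YUY2 to a Y plane, 32 pixels per iteration.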
670 void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
671 int x;
672 v16u8 src0, src1, src2, src3, dst0, dst1;
673
674 for (x = 0; x < width; x += 32) {
675 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
676 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
677 dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
678 ST_UB2(dst0, dst1, dst_y, 16);
679 src_yuy2 += 64;
680 dst_y += 32;
681 }
682 }
683
684 void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
685 int src_stride_yuy2,
686 uint8_t* dst_u,
687 uint8_t* dst_v,
688 int width) {
689 const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
690 int x;
691 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
692 v16u8 vec0, vec1, dst0, dst1;
693
694 for (x = 0; x < width; x += 32) {
695 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
696 LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
697 src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
698 src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
699 src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
700 src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
701 vec0 = __msa_aver_u_b(src0, src2);
702 vec1 = __msa_aver_u_b(src1, src3);
703 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
704 dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
705 ST_UB(dst0, dst_u);
706 ST_UB(dst1, dst_v);
707 src_yuy2 += 64;
708 src_yuy2_next += 64;
709 dst_u += 16;
710 dst_v += 16;
711 }
712 }
713
714 void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
715 uint8_t* dst_u,
716 uint8_t* dst_v,
717 int width) {
718 int x;
719 v16u8 src0, src1, src2, src3, dst0, dst1;
720
721 for (x = 0; x < width; x += 32) {
722 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
723 src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
724 src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
725 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
726 dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
727 ST_UB(dst0, dst_u);
728 ST_UB(dst1, dst_v);
729 src_yuy2 += 64;
730 dst_u += 16;
731 dst_v += 16;
732 }
733 }
734
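// Copies the Y samples (odd bytes) of UYVY to a Y plane, 32 pixels per iteration.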
735 void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
736 int x;
737 v16u8 src0, src1, src2, src3, dst0, dst1;
738
739 for (x = 0; x < width; x += 32) {
740 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
741 dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
742 dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
743 ST_UB2(dst0, dst1, dst_y, 16);
744 src_uyvy += 64;
745 dst_y += 32;
746 }
747 }
748
749 void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
750 int src_stride_uyvy,
751 uint8_t* dst_u,
752 uint8_t* dst_v,
753 int width) {
754 const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
755 int x;
756 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
757 v16u8 vec0, vec1, dst0, dst1;
758
759 for (x = 0; x < width; x += 32) {
760 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
761 LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
762 src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
763 src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
764 src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
765 src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
766 vec0 = __msa_aver_u_b(src0, src2);
767 vec1 = __msa_aver_u_b(src1, src3);
768 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
769 dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
770 ST_UB(dst0, dst_u);
771 ST_UB(dst1, dst_v);
772 src_uyvy += 64;
773 src_uyvy_next += 64;
774 dst_u += 16;
775 dst_v += 16;
776 }
777 }
778
779 void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
780 uint8_t* dst_u,
781 uint8_t* dst_v,
782 int width) {
783 int x;
784 v16u8 src0, src1, src2, src3, dst0, dst1;
785
786 for (x = 0; x < width; x += 32) {
787 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
788 src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
789 src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
790 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
791 dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
792 ST_UB(dst0, dst_u);
793 ST_UB(dst1, dst_v);
794 src_uyvy += 64;
795 dst_u += 16;
796 dst_v += 16;
797 }
798 }
799
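// Computes Y from ARGB as (25 * B + 129 * G + 66 * R + 0x1080) >> 8, 16 pixels per iteration.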
800 void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
801 int x;
802 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
803 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
804 v16i8 zero = {0};
805 v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
806 v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
807 v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
808 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
809
810 for (x = 0; x < width; x += 16) {
811 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
812 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
813 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
814 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
815 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
816 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
817 vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
818 vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
819 reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
820 reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
821 reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
822 reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
823 reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
824 reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
825 reg0 *= const_0x19;
826 reg1 *= const_0x19;
827 reg2 *= const_0x81;
828 reg3 *= const_0x81;
829 reg4 *= const_0x42;
830 reg5 *= const_0x42;
831 reg0 += reg2;
832 reg1 += reg3;
833 reg0 += reg4;
834 reg1 += reg5;
835 reg0 += const_0x1080;
836 reg1 += const_0x1080;
837 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
838 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
839 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
840 ST_UB(dst0, dst_y);
841 src_argb += 64;
842 dst_y += 16;
843 }
844 }
845
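// Computes 2x2 subsampled U and V from two rows of ARGB; 32 source pixels produce 16 U and
// 16 V bytes per iteration.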
846 void ARGBToUVRow_MSA(const uint8_t* src_argb,
847 int src_stride_argb,
848 uint8_t* dst_u,
849 uint8_t* dst_v,
850 int width) {
851 int x;
852 const uint8_t* src_argb_next = src_argb + src_stride_argb;
853 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
854 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
855 v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
856 v16u8 dst0, dst1;
857 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
858 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
859 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
860 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f);
861 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
862 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
863 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
864
865 for (x = 0; x < width; x += 32) {
866 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
867 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
868 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
869 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
870 src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64);
871 src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80);
872 src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96);
873 src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112);
874 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
875 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
876 vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
877 vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
878 vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
879 vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
880 vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
881 vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
882 vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
883 vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
884 vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
885 vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
886 vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
887 vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
888 reg0 = __msa_hadd_u_h(vec8, vec8);
889 reg1 = __msa_hadd_u_h(vec9, vec9);
890 reg2 = __msa_hadd_u_h(vec4, vec4);
891 reg3 = __msa_hadd_u_h(vec5, vec5);
892 reg4 = __msa_hadd_u_h(vec0, vec0);
893 reg5 = __msa_hadd_u_h(vec1, vec1);
894 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0);
895 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16);
896 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32);
897 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48);
898 src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64);
899 src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80);
900 src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96);
901 src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112);
902 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
903 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
904 vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
905 vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
906 vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
907 vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
908 vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
909 vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
910 vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
911 vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
912 vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
913 vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
914 vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
915 vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
916 reg0 += __msa_hadd_u_h(vec8, vec8);
917 reg1 += __msa_hadd_u_h(vec9, vec9);
918 reg2 += __msa_hadd_u_h(vec4, vec4);
919 reg3 += __msa_hadd_u_h(vec5, vec5);
920 reg4 += __msa_hadd_u_h(vec0, vec0);
921 reg5 += __msa_hadd_u_h(vec1, vec1);
922 reg0 += const_0x0001;
923 reg1 += const_0x0001;
924 reg2 += const_0x0001;
925 reg3 += const_0x0001;
926 reg4 += const_0x0001;
927 reg5 += const_0x0001;
928 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
929 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
930 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
931 reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
932 reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
933 reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
934 reg6 = reg0 * const_0x70;
935 reg7 = reg1 * const_0x70;
936 reg8 = reg2 * const_0x4A;
937 reg9 = reg3 * const_0x4A;
938 reg6 += const_0x8080;
939 reg7 += const_0x8080;
940 reg8 += reg4 * const_0x26;
941 reg9 += reg5 * const_0x26;
942 reg0 *= const_0x12;
943 reg1 *= const_0x12;
944 reg2 *= const_0x5E;
945 reg3 *= const_0x5E;
946 reg4 *= const_0x70;
947 reg5 *= const_0x70;
948 reg2 += reg0;
949 reg3 += reg1;
950 reg4 += const_0x8080;
951 reg5 += const_0x8080;
952 reg6 -= reg8;
953 reg7 -= reg9;
954 reg4 -= reg2;
955 reg5 -= reg3;
956 reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
957 reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
958 reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
959 reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
960 dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
961 dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
962 ST_UB(dst0, dst_u);
963 ST_UB(dst1, dst_v);
964 src_argb += 128;
965 src_argb_next += 128;
966 dst_u += 16;
967 dst_v += 16;
968 }
969 }
970
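// Drops the alpha byte, packing 16 ARGB pixels into 48 bytes of RGB24 per iteration.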
971 void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
972 int x;
973 v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
974 v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
975 v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
976 16, 17, 18, 20, 21, 22, 24, 25};
977 v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
978 21, 22, 24, 25, 26, 28, 29, 30};
979
980 for (x = 0; x < width; x += 16) {
981 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
982 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
983 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
984 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
985 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
986 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
987 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
988 ST_UB2(dst0, dst1, dst_rgb, 16);
989 ST_UB(dst2, (dst_rgb + 32));
990 src_argb += 64;
991 dst_rgb += 48;
992 }
993 }
994
995 void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
996 int x;
997 v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
998 v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
999 v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
1000 18, 17, 16, 22, 21, 20, 26, 25};
1001 v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
1002 21, 20, 26, 25, 24, 30, 29, 28};
1003
1004 for (x = 0; x < width; x += 16) {
1005 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1006 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1007 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
1008 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
1009 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
1010 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
1011 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
1012 ST_UB2(dst0, dst1, dst_rgb, 16);
1013 ST_UB(dst2, (dst_rgb + 32));
1014 src_argb += 64;
1015 dst_rgb += 48;
1016 }
1017 }
1018
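// Packs 8 ARGB pixels per iteration into 16-bit RGB565.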
1019 void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
1020 int x;
1021 v16u8 src0, src1, dst0;
1022 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1023 v16i8 zero = {0};
1024
1025 for (x = 0; x < width; x += 8) {
1026 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1027 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1028 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
1029 vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
1030 vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
1031 vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
1032 vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
1033 vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
1034 vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
1035 vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
1036 vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
1037 vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
1038 vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
1039 vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
1040 vec0 = __msa_binsli_b(vec0, vec1, 2);
1041 vec1 = __msa_binsli_b(vec2, vec3, 4);
1042 vec4 = __msa_binsli_b(vec4, vec5, 2);
1043 vec5 = __msa_binsli_b(vec6, vec7, 4);
1044 vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
1045 vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
1046 dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
1047 ST_UB(dst0, dst_rgb);
1048 src_argb += 32;
1049 dst_rgb += 16;
1050 }
1051 }
1052
1053 void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
1054 uint8_t* dst_rgb,
1055 int width) {
1056 int x;
1057 v16u8 src0, src1, dst0;
1058 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1059 v16i8 zero = {0};
1060
1061 for (x = 0; x < width; x += 8) {
1062 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1063 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1064 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
1065 vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
1066 vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
1067 vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
1068 vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
1069 vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
1070 vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
1071 vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
1072 vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
1073 vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
1074 vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
1075 vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
1076 vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
1077 vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
1078 vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
1079 vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
1080 vec0 = __msa_binsli_b(vec0, vec1, 2);
1081 vec5 = __msa_binsli_b(vec5, vec6, 2);
1082 vec1 = __msa_binsli_b(vec2, vec3, 5);
1083 vec6 = __msa_binsli_b(vec7, vec8, 5);
1084 vec1 = __msa_binsli_b(vec1, vec4, 0);
1085 vec6 = __msa_binsli_b(vec6, vec9, 0);
1086 vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
1087 vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
1088 dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
1089 ST_UB(dst0, dst_rgb);
1090 src_argb += 32;
1091 dst_rgb += 16;
1092 }
1093 }
1094
1095 void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
1096 uint8_t* dst_rgb,
1097 int width) {
1098 int x;
1099 v16u8 src0, src1;
1100 v16u8 vec0, vec1;
1101 v16u8 dst0;
1102 v16i8 zero = {0};
1103
1104 for (x = 0; x < width; x += 8) {
1105 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1106 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1107 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
1108 vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
1109 src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
1110 src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
1111 vec0 = __msa_binsli_b(vec0, src0, 3);
1112 vec1 = __msa_binsli_b(vec1, src1, 3);
1113 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1114 ST_UB(dst0, dst_rgb);
1115 src_argb += 32;
1116 dst_rgb += 16;
1117 }
1118 }
1119
1120 void ARGBToUV444Row_MSA(const uint8_t* src_argb,
1121 uint8_t* dst_u,
1122 uint8_t* dst_v,
1123 int32_t width) {
1124 int32_t x;
1125 v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
1126 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1127 v8u16 vec8, vec9, vec10, vec11;
1128 v8u16 const_112 = (v8u16)__msa_ldi_h(112);
1129 v8u16 const_74 = (v8u16)__msa_ldi_h(74);
1130 v8u16 const_38 = (v8u16)__msa_ldi_h(38);
1131 v8u16 const_94 = (v8u16)__msa_ldi_h(94);
1132 v8u16 const_18 = (v8u16)__msa_ldi_h(18);
1133 v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
1134 v16i8 zero = {0};
1135
1136 for (x = width; x > 0; x -= 16) {
1137 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1138 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1139 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
1140 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
1141 reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1142 reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
1143 reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1144 reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
1145 src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
1146 src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
1147 src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
1148 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
1149 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
1150 vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
1151 vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
1152 vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
1153 vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
1154 vec10 = vec0 * const_18;
1155 vec11 = vec1 * const_18;
1156 vec8 = vec2 * const_94;
1157 vec9 = vec3 * const_94;
1158 vec6 = vec4 * const_112;
1159 vec7 = vec5 * const_112;
1160 vec0 *= const_112;
1161 vec1 *= const_112;
1162 vec2 *= const_74;
1163 vec3 *= const_74;
1164 vec4 *= const_38;
1165 vec5 *= const_38;
1166 vec8 += vec10;
1167 vec9 += vec11;
1168 vec6 += const_32896;
1169 vec7 += const_32896;
1170 vec0 += const_32896;
1171 vec1 += const_32896;
1172 vec2 += vec4;
1173 vec3 += vec5;
1174 vec0 -= vec2;
1175 vec1 -= vec3;
1176 vec6 -= vec8;
1177 vec7 -= vec9;
1178 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1179 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1180 vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
1181 vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
1182 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1183 dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
1184 ST_UB(dst0, dst_u);
1185 ST_UB(dst1, dst_v);
1186 src_argb += 64;
1187 dst_u += 16;
1188 dst_v += 16;
1189 }
1190 }
1191
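// Multiplies two ARGB rows channel by channel and scales the product back to 8 bits
// (roughly dst = src0 * src1 / 255); 4 pixels per iteration.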
1192 void ARGBMultiplyRow_MSA(const uint8_t* src_argb,
1193 const uint8_t* src_argb1,
1194 uint8_t* dst_argb,
1195 int width) {
1196 int x;
1197 v16u8 src0, src1, dst0;
1198 v8u16 vec0, vec1, vec2, vec3;
1199 v4u32 reg0, reg1, reg2, reg3;
1200 v8i16 zero = {0};
1201
1202 for (x = 0; x < width; x += 4) {
1203 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1204 src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
1205 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1206 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1207 vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
1208 vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
1209 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1210 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1211 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1212 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1213 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1214 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1215 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1216 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
1217 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
1218 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
1219 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
1220 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
1221 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1222 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1223 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1224 ST_UB(dst0, dst_argb);
1225 src_argb += 16;
1226 src_argb1 += 16;
1227 dst_argb += 16;
1228 }
1229 }
1230
1231 void ARGBAddRow_MSA(const uint8_t* src_argb,
1232 const uint8_t* src_argb1,
1233 uint8_t* dst_argb,
1234 int width) {
1235 int x;
1236 v16u8 src0, src1, src2, src3, dst0, dst1;
1237
1238 for (x = 0; x < width; x += 8) {
1239 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1240 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1241 src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
1242 src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
1243 dst0 = __msa_adds_u_b(src0, src2);
1244 dst1 = __msa_adds_u_b(src1, src3);
1245 ST_UB2(dst0, dst1, dst_argb, 16);
1246 src_argb += 32;
1247 src_argb1 += 32;
1248 dst_argb += 32;
1249 }
1250 }
1251
1252 void ARGBSubtractRow_MSA(const uint8_t* src_argb,
1253 const uint8_t* src_argb1,
1254 uint8_t* dst_argb,
1255 int width) {
1256 int x;
1257 v16u8 src0, src1, src2, src3, dst0, dst1;
1258
1259 for (x = 0; x < width; x += 8) {
1260 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1261 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1262 src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
1263 src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
1264 dst0 = __msa_subs_u_b(src0, src2);
1265 dst1 = __msa_subs_u_b(src1, src3);
1266 ST_UB2(dst0, dst1, dst_argb, 16);
1267 src_argb += 32;
1268 src_argb1 += 32;
1269 dst_argb += 32;
1270 }
1271 }
1272
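// Attenuates ARGB: scales B, G and R by the pixel's alpha and leaves alpha unchanged;
// 8 pixels per iteration.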
1273 void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
1274 uint8_t* dst_argb,
1275 int width) {
1276 int x;
1277 v16u8 src0, src1, dst0, dst1;
1278 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1279 v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
1280 v8i16 zero = {0};
1281 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
1282
1283 for (x = 0; x < width; x += 8) {
1284 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1285 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1286 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1287 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1288 vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
1289 vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
1290 vec4 = (v8u16)__msa_fill_h(vec0[3]);
1291 vec5 = (v8u16)__msa_fill_h(vec0[7]);
1292 vec6 = (v8u16)__msa_fill_h(vec1[3]);
1293 vec7 = (v8u16)__msa_fill_h(vec1[7]);
1294 vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
1295 vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1296 vec6 = (v8u16)__msa_fill_h(vec2[3]);
1297 vec7 = (v8u16)__msa_fill_h(vec2[7]);
1298 vec8 = (v8u16)__msa_fill_h(vec3[3]);
1299 vec9 = (v8u16)__msa_fill_h(vec3[7]);
1300 vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1301 vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
1302 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
1303 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
1304 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
1305 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
1306 reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
1307 reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
1308 reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
1309 reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
1310 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1311 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1312 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1313 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1314 reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1315 reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1316 reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1317 reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
1318 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1319 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1320 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1321 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
1322 reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
1323 reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
1324 reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
1325 reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
1326 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1327 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1328 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
1329 vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
1330 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1331 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1332 dst0 = __msa_bmnz_v(dst0, src0, mask);
1333 dst1 = __msa_bmnz_v(dst1, src1, mask);
1334 ST_UB2(dst0, dst1, dst_argb, 16);
1335 src_argb += 32;
1336 dst_argb += 32;
1337 }
1338 }
1339
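// Adds the per-pixel dither value to each channel, clamps to [0, 255] and packs to RGB565;
// 8 pixels per iteration.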
1340 void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
1341 uint8_t* dst_rgb,
1342 uint32_t dither4,
1343 int width) {
1344 int x;
1345 v16u8 src0, src1, dst0, vec0, vec1;
1346 v8i16 vec_d0;
1347 v8i16 reg0, reg1, reg2;
1348 v16i8 zero = {0};
1349 v8i16 max = __msa_ldi_h(0xFF);
1350
1351 vec_d0 = (v8i16)__msa_fill_w(dither4);
1352 vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
1353
1354 for (x = 0; x < width; x += 8) {
1355 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1356 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1357 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1358 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1359 reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
1360 reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
1361 reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
1362 reg0 += vec_d0;
1363 reg1 += vec_d0;
1364 reg2 += vec_d0;
1365 reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
1366 reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
1367 reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
1368 reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
1369 reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
1370 reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
1371 reg0 = __msa_srai_h(reg0, 3);
1372 reg2 = __msa_srai_h(reg2, 3);
1373 reg1 = __msa_srai_h(reg1, 2);
1374 reg2 = __msa_slli_h(reg2, 11);
1375 reg1 = __msa_slli_h(reg1, 5);
1376 reg0 |= reg1;
1377 dst0 = (v16u8)(reg0 | reg2);
1378 ST_UB(dst0, dst_rgb);
1379 src_argb += 32;
1380 dst_rgb += 16;
1381 }
1382 }
1383
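// Reorders the four bytes of every pixel according to the 4-byte 'shuffler' table;
// 8 pixels per iteration.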
1384 void ARGBShuffleRow_MSA(const uint8_t* src_argb,
1385 uint8_t* dst_argb,
1386 const uint8_t* shuffler,
1387 int width) {
1388 int x;
1389 v16u8 src0, src1, dst0, dst1;
1390 v16i8 vec0;
1391 v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
1392 int32_t val = LW((int32_t*)shuffler);
1393
1394 vec0 = (v16i8)__msa_fill_w(val);
1395 shuffler_vec += vec0;
1396
1397 for (x = 0; x < width; x += 8) {
1398 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1399 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
1400 dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
1401 dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
1402 ST_UB2(dst0, dst1, dst_argb, 16);
1403 src_argb += 32;
1404 dst_argb += 32;
1405 }
1406 }
1407
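// Multiplies each channel by the matching byte of 'value' and scales the result back to
// 8 bits; 4 pixels per iteration.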
1408 void ARGBShadeRow_MSA(const uint8_t* src_argb,
1409 uint8_t* dst_argb,
1410 int width,
1411 uint32_t value) {
1412 int x;
1413 v16u8 src0, dst0;
1414 v8u16 vec0, vec1;
1415 v4u32 reg0, reg1, reg2, reg3, rgba_scale;
1416 v8i16 zero = {0};
1417
1418 rgba_scale[0] = value;
1419 rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
1420 rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
1421
1422 for (x = 0; x < width; x += 4) {
1423 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1424 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1425 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1426 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1427 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1428 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1429 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1430 reg0 *= rgba_scale;
1431 reg1 *= rgba_scale;
1432 reg2 *= rgba_scale;
1433 reg3 *= rgba_scale;
1434 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1435 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1436 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1437 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
1438 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1439 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1440 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1441 ST_UB(dst0, dst_argb);
1442 src_argb += 16;
1443 dst_argb += 16;
1444 }
1445 }
1446
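// Gray scale ARGB: B, G and R are replaced with (29B + 150G + 77R + 128) >> 8;
// alpha is preserved. 8 pixels per loop.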
1447 void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1448 int x;
1449 v16u8 src0, src1, vec0, vec1, dst0, dst1;
1450 v8u16 reg0;
1451 v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
1452 v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
1453
1454 for (x = 0; x < width; x += 8) {
1455 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1456 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
1457 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1458 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
1459 reg0 = __msa_dotp_u_h(vec0, const_0x961D);
1460 reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
1461 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
1462 vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
1463 vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
1464 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
1465 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
1466 ST_UB2(dst0, dst1, dst_argb, 16);
1467 src_argb += 32;
1468 dst_argb += 32;
1469 }
1470 }
1471
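// Apply a sepia tone to ARGB pixels in place; alpha is preserved.
// 8 pixels per loop.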
1472 void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
1473 int x;
1474 v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
1475 v8u16 reg0, reg1, reg2;
1476 v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
1477 v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
1478 v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
1479 v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
1480 v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
1481 v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
1482 v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
1483
1484 for (x = 0; x < width; x += 8) {
1485 src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
1486 src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
1487 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1488 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
1489 vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
1490 reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
1491 reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
1492 reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
1493 reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
1494 reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
1495 reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
1496 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
1497 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
1498 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
1499 reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
1500 reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
1501 vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
1502 vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
1503 vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
1504 vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
1505 vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
1506 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
1507 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
1508 ST_UB2(dst0, dst1, dst_argb, 16);
1509 dst_argb += 32;
1510 }
1511 }
1512
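// ARGB4444 to ARGB8888: each 4-bit channel is replicated into the high and low
// nibble of the output byte. 16 pixels per loop.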
1513 void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
1514 uint8_t* dst_argb,
1515 int width) {
1516 int x;
1517 v16u8 src0, src1;
1518 v8u16 vec0, vec1, vec2, vec3;
1519 v16u8 dst0, dst1, dst2, dst3;
1520
1521 for (x = 0; x < width; x += 16) {
1522 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
1523 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
1524 vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
1525 vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
1526 vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
1527 vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
1528 vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
1529 vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
1530 vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
1531 vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
1532 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
1533 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
1534 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
1535 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
1536 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1537 src_argb4444 += 32;
1538 dst_argb += 64;
1539 }
1540 }
1541
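// ARGB1555 to ARGB8888: 5-bit channels are expanded to 8 bits and the 1-bit
// alpha becomes 0x00 or 0xFF. 16 pixels per loop.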
1542 void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
1543 uint8_t* dst_argb,
1544 int width) {
1545 int x;
1546 v8u16 src0, src1;
1547 v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
1548 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
1549 v16u8 dst0, dst1, dst2, dst3;
1550 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1551
1552 for (x = 0; x < width; x += 16) {
1553 src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0);
1554 src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16);
1555 vec0 = src0 & const_0x1F;
1556 vec1 = src1 & const_0x1F;
1557 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1558 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1559 vec2 = src0 & const_0x1F;
1560 vec3 = src1 & const_0x1F;
1561 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1562 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1563 vec4 = src0 & const_0x1F;
1564 vec5 = src1 & const_0x1F;
1565 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1566 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1567 reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1568 reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1569 reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
1570 reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1571 reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
1572 reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
1573 reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
1574 reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
1575 reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
1576 reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
1577 reg3 = -reg3;
1578 reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
1579 reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
1580 reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
1581 reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
1582 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
1583 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
1584 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
1585 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
1586 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1587 src_argb1555 += 32;
1588 dst_argb += 64;
1589 }
1590 }
1591
1592 void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
1593 uint8_t* dst_argb,
1594 int width) {
1595 int x;
1596 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
1597 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
1598 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
1599 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1600 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1601 v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
1602 v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
1603
1604 for (x = 0; x < width; x += 16) {
1605 src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0);
1606 src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16);
1607 vec0 = src0 & const_0x1F;
1608 vec1 = src0 & const_0x7E0;
1609 vec2 = src0 & const_0xF800;
1610 vec3 = src1 & const_0x1F;
1611 vec4 = src1 & const_0x7E0;
1612 vec5 = src1 & const_0xF800;
1613 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
1614 reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
1615 reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
1616 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
1617 reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
1618 reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
1619 reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
1620 reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
1621 reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
1622 reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
1623 reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
1624 reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
1625 res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
1626 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
1627 res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
1628 res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
1629 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
1630 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
1631 dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
1632 dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
1633 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1634 src_rgb565 += 32;
1635 dst_argb += 64;
1636 }
1637 }
1638
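// RGB24 to ARGB: insert an opaque alpha byte after every 3 source bytes.
// 16 pixels per loop.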
1639 void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
1640 uint8_t* dst_argb,
1641 int width) {
1642 int x;
1643 v16u8 src0, src1, src2;
1644 v16u8 vec0, vec1, vec2;
1645 v16u8 dst0, dst1, dst2, dst3;
1646 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1647 v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
1648
1649 for (x = 0; x < width; x += 16) {
1650 src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0);
1651 src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16);
1652 src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32);
1653 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
1654 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
1655 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
1656 dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
1657 dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
1658 dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
1659 dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
1660 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1661 src_rgb24 += 48;
1662 dst_argb += 64;
1663 }
1664 }
1665
1666 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1667 int x;
1668 v16u8 src0, src1, src2;
1669 v16u8 vec0, vec1, vec2;
1670 v16u8 dst0, dst1, dst2, dst3;
1671 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1672 v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
1673
1674 for (x = 0; x < width; x += 16) {
1675 src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
1676 src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
1677 src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
1678 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
1679 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
1680 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
1681 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
1682 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
1683 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
1684 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
1685 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1686 src_raw += 48;
1687 dst_argb += 64;
1688 }
1689 }
1690
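// ARGB1555 to luma: expand the 5-bit channels to 8 bits, then
// Y = (66R + 129G + 25B + 0x1080) >> 8. 16 pixels per loop.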
1691 void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
1692 uint8_t* dst_y,
1693 int width) {
1694 int x;
1695 v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
1696 v16u8 reg0, reg1, reg2, dst;
1697 v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
1698 v8i16 res0, res1;
1699 v8i16 const_66 = (v8i16)__msa_ldi_h(66);
1700 v8i16 const_129 = (v8i16)__msa_ldi_h(129);
1701 v8i16 const_25 = (v8i16)__msa_ldi_h(25);
1702 v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080);
1703 v16u8 zero = (v16u8)__msa_ldi_b(0);
1704
1705 for (x = 0; x < width; x += 16) {
1706 src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0);
1707 src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16);
1708 tmp0 = (v16u8)__msa_pckev_b(src1, src0);
1709 tmp1 = (v16u8)__msa_pckod_b(src1, src0);
1710 tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
1711 tmpg = (v16u8)__msa_srli_b(tmp0, 5);
1712 reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
1713 reg0 = (v16u8)__msa_slli_b(reg0, 3);
1714 tmpg = (v16u8)__msa_or_v(tmpg, reg0);
1715 reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
1716 tmpr = (v16u8)__msa_srli_b(reg1, 2);
1717 reg0 = (v16u8)__msa_slli_b(tmpb, 3);
1718 reg1 = (v16u8)__msa_slli_b(tmpg, 3);
1719 reg2 = (v16u8)__msa_slli_b(tmpr, 3);
1720 tmpb = (v16u8)__msa_srli_b(tmpb, 2);
1721 tmpg = (v16u8)__msa_srli_b(tmpg, 2);
1722 tmpr = (v16u8)__msa_srli_b(tmpr, 2);
1723 tmpb = (v16u8)__msa_or_v(reg0, tmpb);
1724 tmpg = (v16u8)__msa_or_v(reg1, tmpg);
1725 tmpr = (v16u8)__msa_or_v(reg2, tmpr);
1726 tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
1727 tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
1728 tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
1729 tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
1730 tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
1731 tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
1732 res0 = const_1080 + const_25 * tmpb_r;
1733 res1 = const_1080 + const_25 * tmpb_l;
1734 res0 += const_129 * tmpg_r;
1735 res1 += const_129 * tmpg_l;
1736 res0 += const_66 * tmpr_r;
1737 res1 += const_66 * tmpr_l;
1738 dst = (v16u8)__msa_pckod_b(res1, res0);
1739 ST_UB(dst, dst_y);
1740 src_argb1555 += 32;
1741 dst_y += 16;
1742 }
1743 }
1744
1745 void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1746 int x;
1747 v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
1748 v16u8 reg0, reg1, dst;
1749 v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
1750 v8i16 res0, res1;
1751 v8i16 const_66 = (v8i16)__msa_ldi_h(66);
1752 v8i16 const_129 = (v8i16)__msa_ldi_h(129);
1753 v8i16 const_25 = (v8i16)__msa_ldi_h(25);
1754 v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
1755 v16u8 zero = (v16u8)__msa_ldi_b(0);
1756
1757 for (x = 0; x < width; x += 16) {
1758 src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0);
1759 src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16);
1760 tmp0 = (v16u8)__msa_pckev_b(src1, src0);
1761 tmp1 = (v16u8)__msa_pckod_b(src1, src0);
1762 tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
1763 tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
1764 reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
1765 reg0 = (v16u8)__msa_srli_b(tmp0, 5);
1766 reg1 = (v16u8)__msa_slli_b(reg1, 3);
1767 tmpg = (v16u8)__msa_or_v(reg1, reg0);
1768 reg0 = (v16u8)__msa_slli_b(tmpb, 3);
1769 reg1 = (v16u8)__msa_srli_b(tmpb, 2);
1770 tmpb = (v16u8)__msa_or_v(reg1, reg0);
1771 reg0 = (v16u8)__msa_slli_b(tmpg, 2);
1772 reg1 = (v16u8)__msa_srli_b(tmpg, 4);
1773 tmpg = (v16u8)__msa_or_v(reg1, reg0);
1774 reg0 = (v16u8)__msa_srli_b(tmpr, 5);
1775 tmpr = (v16u8)__msa_or_v(tmpr, reg0);
1776 tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
1777 tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
1778 tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
1779 tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
1780 tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
1781 tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
1782 res0 = const_1080 + const_25 * tmpb_r;
1783 res1 = const_1080 + const_25 * tmpb_l;
1784 res0 += const_129 * tmpg_r;
1785 res1 += const_129 * tmpg_l;
1786 res0 += const_66 * tmpr_r;
1787 res1 += const_66 * tmpr_l;
1788 dst = (v16u8)__msa_pckod_b(res1, res0);
1789 ST_UB(dst, dst_y);
1790 src_rgb565 += 32;
1791 dst_y += 16;
1792 }
1793 }
1794
1795 void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1796 int x;
1797 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
1798 v8u16 vec0, vec1, vec2, vec3;
1799 v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
1800 v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
1801 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
1802 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
1803 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
1804 18, 19, 20, 21, 21, 22, 23, 24};
1805 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
1806 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
1807 v16i8 zero = {0};
1808
1809 for (x = 0; x < width; x += 16) {
1810 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1811 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1812 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
1813 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
1814 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
1815 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
1816 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
1817 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1818 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1819 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
1820 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
1821 vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
1822 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
1823 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
1824 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
1825 vec0 += const_0x1080;
1826 vec1 += const_0x1080;
1827 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1828 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1829 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1830 ST_UB(dst0, dst_y);
1831 src_argb += 48;
1832 dst_y += 16;
1833 }
1834 }
1835
1836 void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1837 int x;
1838 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
1839 v8u16 vec0, vec1, vec2, vec3;
1840 v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
1841 v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
1842 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
1843 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
1844 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
1845 18, 19, 20, 21, 21, 22, 23, 24};
1846 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
1847 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
1848 v16i8 zero = {0};
1849
1850 for (x = 0; x < width; x += 16) {
1851 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
1852 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
1853 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
1854 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
1855 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
1856 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
1857 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
1858 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1859 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1860 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
1861 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
1862 vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
1863 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
1864 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
1865 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
1866 vec0 += const_0x1080;
1867 vec1 += const_0x1080;
1868 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1869 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1870 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1871 ST_UB(dst0, dst_y);
1872 src_argb += 48;
1873 dst_y += 16;
1874 }
1875 }
1876
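// ARGB1555 to U and V: the 5-bit channels of two rows are expanded to 8-bit
// B, G, R and passed to RGBTOUV, which emits 8 U and 8 V bytes per iteration.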
1877 void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
1878 int src_stride_argb1555,
1879 uint8_t* dst_u,
1880 uint8_t* dst_v,
1881 int width) {
1882 int x;
1883 const uint16_t* s = (const uint16_t*)src_argb1555;
1884 const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
1885 int64_t res0, res1;
1886 v16u8 src0, src1, src2, src3, dst;
1887 v16u8 tmp0, tmp1, tmp2, tmp3;
1888 v16u8 reg0, reg1, reg2, reg3;
1889 v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
1890 v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
1891 v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
1892 v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
1893 v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
1894 v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
1895 v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
1896
1897 for (x = 0; x < width; x += 16) {
1898 src0 = (v16u8)__msa_ld_b((void*)s, 0);
1899 src1 = (v16u8)__msa_ld_b((void*)s, 16);
1900 src2 = (v16u8)__msa_ld_b((void*)t, 0);
1901 src3 = (v16u8)__msa_ld_b((void*)t, 16);
1902 tmp0 = (v16u8)__msa_pckev_b(src1, src0);
1903 tmp1 = (v16u8)__msa_pckod_b(src1, src0);
1904 tmp2 = (v16u8)__msa_pckev_b(src3, src2);
1905 tmp3 = (v16u8)__msa_pckod_b(src3, src2);
1906 tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
1907 nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
1908 tmpg = (v16u8)__msa_srli_b(tmp0, 5);
1909 nexg = (v16u8)__msa_srli_b(tmp2, 5);
1910 reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
1911 reg2 = (v16u8)__msa_andi_b(tmp3, 0x03);
1912 reg0 = (v16u8)__msa_slli_b(reg0, 3);
1913 reg2 = (v16u8)__msa_slli_b(reg2, 3);
1914 tmpg = (v16u8)__msa_or_v(tmpg, reg0);
1915 nexg = (v16u8)__msa_or_v(nexg, reg2);
1916 reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
1917 reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C);
1918 tmpr = (v16u8)__msa_srli_b(reg1, 2);
1919 nexr = (v16u8)__msa_srli_b(reg3, 2);
1920 reg0 = (v16u8)__msa_slli_b(tmpb, 3);
1921 reg1 = (v16u8)__msa_slli_b(tmpg, 3);
1922 reg2 = (v16u8)__msa_slli_b(tmpr, 3);
1923 tmpb = (v16u8)__msa_srli_b(tmpb, 2);
1924 tmpg = (v16u8)__msa_srli_b(tmpg, 2);
1925 tmpr = (v16u8)__msa_srli_b(tmpr, 2);
1926 tmpb = (v16u8)__msa_or_v(reg0, tmpb);
1927 tmpg = (v16u8)__msa_or_v(reg1, tmpg);
1928 tmpr = (v16u8)__msa_or_v(reg2, tmpr);
1929 reg0 = (v16u8)__msa_slli_b(nexb, 3);
1930 reg1 = (v16u8)__msa_slli_b(nexg, 3);
1931 reg2 = (v16u8)__msa_slli_b(nexr, 3);
1932 nexb = (v16u8)__msa_srli_b(nexb, 2);
1933 nexg = (v16u8)__msa_srli_b(nexg, 2);
1934 nexr = (v16u8)__msa_srli_b(nexr, 2);
1935 nexb = (v16u8)__msa_or_v(reg0, nexb);
1936 nexg = (v16u8)__msa_or_v(reg1, nexg);
1937 nexr = (v16u8)__msa_or_v(reg2, nexr);
1938 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
1939 res0 = __msa_copy_u_d((v2i64)dst, 0);
1940 res1 = __msa_copy_u_d((v2i64)dst, 1);
1941 SD(res0, dst_u);
1942 SD(res1, dst_v);
1943 s += 16;
1944 t += 16;
1945 dst_u += 8;
1946 dst_v += 8;
1947 }
1948 }
1949
1950 void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
1951 int src_stride_rgb565,
1952 uint8_t* dst_u,
1953 uint8_t* dst_v,
1954 int width) {
1955 int x;
1956 const uint16_t* s = (const uint16_t*)src_rgb565;
1957 const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
1958 int64_t res0, res1;
1959 v16u8 src0, src1, src2, src3, dst;
1960 v16u8 tmp0, tmp1, tmp2, tmp3;
1961 v16u8 reg0, reg1, reg2, reg3;
1962 v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
1963 v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
1964 v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
1965 v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
1966 v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
1967 v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
1968 v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
1969
1970 for (x = 0; x < width; x += 16) {
1971 src0 = (v16u8)__msa_ld_b((void*)s, 0);
1972 src1 = (v16u8)__msa_ld_b((void*)s, 16);
1973 src2 = (v16u8)__msa_ld_b((void*)t, 0);
1974 src3 = (v16u8)__msa_ld_b((void*)t, 16);
1975 tmp0 = (v16u8)__msa_pckev_b(src1, src0);
1976 tmp1 = (v16u8)__msa_pckod_b(src1, src0);
1977 tmp2 = (v16u8)__msa_pckev_b(src3, src2);
1978 tmp3 = (v16u8)__msa_pckod_b(src3, src2);
1979 tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
1980 tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
1981 nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
1982 nexr = (v16u8)__msa_andi_b(tmp3, 0xF8);
1983 reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
1984 reg3 = (v16u8)__msa_andi_b(tmp3, 0x07);
1985 reg0 = (v16u8)__msa_srli_b(tmp0, 5);
1986 reg1 = (v16u8)__msa_slli_b(reg1, 3);
1987 reg2 = (v16u8)__msa_srli_b(tmp2, 5);
1988 reg3 = (v16u8)__msa_slli_b(reg3, 3);
1989 tmpg = (v16u8)__msa_or_v(reg1, reg0);
1990 nexg = (v16u8)__msa_or_v(reg2, reg3);
1991 reg0 = (v16u8)__msa_slli_b(tmpb, 3);
1992 reg1 = (v16u8)__msa_srli_b(tmpb, 2);
1993 reg2 = (v16u8)__msa_slli_b(nexb, 3);
1994 reg3 = (v16u8)__msa_srli_b(nexb, 2);
1995 tmpb = (v16u8)__msa_or_v(reg1, reg0);
1996 nexb = (v16u8)__msa_or_v(reg2, reg3);
1997 reg0 = (v16u8)__msa_slli_b(tmpg, 2);
1998 reg1 = (v16u8)__msa_srli_b(tmpg, 4);
1999 reg2 = (v16u8)__msa_slli_b(nexg, 2);
2000 reg3 = (v16u8)__msa_srli_b(nexg, 4);
2001 tmpg = (v16u8)__msa_or_v(reg1, reg0);
2002 nexg = (v16u8)__msa_or_v(reg2, reg3);
2003 reg0 = (v16u8)__msa_srli_b(tmpr, 5);
2004 reg2 = (v16u8)__msa_srli_b(nexr, 5);
2005 tmpr = (v16u8)__msa_or_v(tmpr, reg0);
2006 nexr = (v16u8)__msa_or_v(nexr, reg2);
2007 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
2008 res0 = __msa_copy_u_d((v2i64)dst, 0);
2009 res1 = __msa_copy_u_d((v2i64)dst, 1);
2010 SD(res0, dst_u);
2011 SD(res1, dst_v);
2012 s += 16;
2013 t += 16;
2014 dst_u += 8;
2015 dst_v += 8;
2016 }
2017 }
2018
2019 void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
2020 int src_stride_rgb,
2021 uint8_t* dst_u,
2022 uint8_t* dst_v,
2023 int width) {
2024 int x;
2025 const uint8_t* s = src_rgb;
2026 const uint8_t* t = src_rgb + src_stride_rgb;
2027 int64_t res0, res1;
2028 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2029 v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
2030 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2031 v8i16 reg0, reg1, reg2, reg3;
2032 v16u8 dst0;
2033 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
2034 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
2035 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
2036 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
2037 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
2038 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2039 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
2040 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
2041 v16i8 zero = {0};
2042
2043 for (x = 0; x < width; x += 16) {
2044 inp0 = (v16u8)__msa_ld_b((void*)s, 0);
2045 inp1 = (v16u8)__msa_ld_b((void*)s, 16);
2046 inp2 = (v16u8)__msa_ld_b((void*)s, 32);
2047 inp3 = (v16u8)__msa_ld_b((void*)t, 0);
2048 inp4 = (v16u8)__msa_ld_b((void*)t, 16);
2049 inp5 = (v16u8)__msa_ld_b((void*)t, 32);
2050 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
2051 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
2052 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
2053 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
2054 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
2055 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
2056 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
2057 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
2058 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
2059 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
2060 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
2061 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
2062 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
2063 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
2064 vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
2065 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
2066 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
2067 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
2068 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
2069 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
2070 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
2071 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
2072 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
2073 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
2074 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
2075 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
2076 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
2077 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
2078 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
2079 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
2080 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
2081 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
2082 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
2083 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
2084 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
2085 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
2086 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
2087 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
2088 reg0 += const_0x0001;
2089 reg1 += const_0x0001;
2090 reg2 += const_0x0001;
2091 reg3 += const_0x0001;
2092 reg0 = __msa_srai_h((v8i16)reg0, 1);
2093 reg1 = __msa_srai_h((v8i16)reg1, 1);
2094 reg2 = __msa_srai_h((v8i16)reg2, 1);
2095 reg3 = __msa_srai_h((v8i16)reg3, 1);
2096 vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
2097 vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
2098 vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
2099 vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
2100 vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
2101 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
2102 vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
2103 vec3 = vec0 * const_0x70;
2104 vec4 = vec1 * const_0x4A;
2105 vec5 = vec2 * const_0x26;
2106 vec2 *= const_0x70;
2107 vec1 *= const_0x5E;
2108 vec0 *= const_0x12;
2109 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
2110 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
2111 reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
2112 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
2113 reg0 += reg1;
2114 reg2 += reg3;
2115 reg0 = __msa_srai_h(reg0, 8);
2116 reg2 = __msa_srai_h(reg2, 8);
2117 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
2118 res0 = __msa_copy_u_d((v2i64)dst0, 0);
2119 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2120 SD(res0, dst_u);
2121 SD(res1, dst_v);
2122 t += 48;
2123 s += 48;
2124 dst_u += 8;
2125 dst_v += 8;
2126 }
2127 }
2128
2129 void RAWToUVRow_MSA(const uint8_t* src_rgb,
2130 int src_stride_rgb,
2131 uint8_t* dst_u,
2132 uint8_t* dst_v,
2133 int width) {
2134 int x;
2135 const uint8_t* s = src_rgb;
2136 const uint8_t* t = src_rgb + src_stride_rgb;
2137 int64_t res0, res1;
2138 v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
2139 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2140 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2141 v8i16 reg0, reg1, reg2, reg3;
2142 v16u8 dst0;
2143 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
2144 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
2145 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
2146 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
2147 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
2148 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2149 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
2150 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
2151 v16i8 zero = {0};
2152
2153 for (x = 0; x < width; x += 16) {
2154 inp0 = (v16u8)__msa_ld_b((void*)s, 0);
2155 inp1 = (v16u8)__msa_ld_b((void*)s, 16);
2156 inp2 = (v16u8)__msa_ld_b((void*)s, 32);
2157 inp3 = (v16u8)__msa_ld_b((void*)t, 0);
2158 inp4 = (v16u8)__msa_ld_b((void*)t, 16);
2159 inp5 = (v16u8)__msa_ld_b((void*)t, 32);
2160 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
2161 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
2162 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
2163 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
2164 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
2165 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
2166 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
2167 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
2168 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
2169 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
2170 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
2171 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
2172 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
2173 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
2174 vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
2175 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
2176 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
2177 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
2178 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
2179 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
2180 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
2181 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
2182 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
2183 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
2184 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
2185 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
2186 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
2187 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
2188 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
2189 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
2190 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
2191 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
2192 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
2193 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
2194 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
2195 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
2196 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
2197 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
2198 reg0 += const_0x0001;
2199 reg1 += const_0x0001;
2200 reg2 += const_0x0001;
2201 reg3 += const_0x0001;
2202 reg0 = __msa_srai_h(reg0, 1);
2203 reg1 = __msa_srai_h(reg1, 1);
2204 reg2 = __msa_srai_h(reg2, 1);
2205 reg3 = __msa_srai_h(reg3, 1);
2206 vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2207 vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2208 vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
2209 vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
2210 vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
2211 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
2212 vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
2213 vec3 = vec0 * const_0x70;
2214 vec4 = vec1 * const_0x4A;
2215 vec5 = vec2 * const_0x26;
2216 vec2 *= const_0x70;
2217 vec1 *= const_0x5E;
2218 vec0 *= const_0x12;
2219 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
2220 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
2221 reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
2222 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
2223 reg0 += reg1;
2224 reg2 += reg3;
2225 reg0 = __msa_srai_h(reg0, 8);
2226 reg2 = __msa_srai_h(reg2, 8);
2227 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
2228 res0 = __msa_copy_u_d((v2i64)dst0, 0);
2229 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2230 SD(res0, dst_u);
2231 SD(res1, dst_v);
2232 t += 48;
2233 s += 48;
2234 dst_u += 8;
2235 dst_v += 8;
2236 }
2237 }
2238
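// NV12 (Y plane plus interleaved UV plane) to ARGB. 8 pixels per loop.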
2239 void NV12ToARGBRow_MSA(const uint8_t* src_y,
2240 const uint8_t* src_uv,
2241 uint8_t* dst_argb,
2242 const struct YuvConstants* yuvconstants,
2243 int width) {
2244 int x;
2245 uint64_t val0, val1;
2246 v16u8 src0, src1, res0, res1, dst0, dst1;
2247 v8i16 vec0, vec1, vec2;
2248 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2249 v4i32 vec_ubvr, vec_ugvg;
2250 v16u8 zero = {0};
2251 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2252 v8i16 const_0x80 = __msa_ldi_h(0x80);
2253
2254 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2255 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2256 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2257
2258 for (x = 0; x < width; x += 8) {
2259 val0 = LD(src_y);
2260 val1 = LD(src_uv);
2261 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2262 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2263 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
2264 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2265 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
2266 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2267 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2268 ST_UB2(dst0, dst1, dst_argb, 16);
2269 src_y += 8;
2270 src_uv += 8;
2271 dst_argb += 32;
2272 }
2273 }
2274
2275 void NV12ToRGB565Row_MSA(const uint8_t* src_y,
2276 const uint8_t* src_uv,
2277 uint8_t* dst_rgb565,
2278 const struct YuvConstants* yuvconstants,
2279 int width) {
2280 int x;
2281 uint64_t val0, val1;
2282 v16u8 src0, src1, dst0;
2283 v8i16 vec0, vec1, vec2;
2284 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2285 v4i32 vec_ubvr, vec_ugvg;
2286 v8i16 const_0x80 = __msa_ldi_h(0x80);
2287 v16u8 zero = {0};
2288
2289 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2290 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2291 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2292
2293 for (x = 0; x < width; x += 8) {
2294 val0 = LD(src_y);
2295 val1 = LD(src_uv);
2296 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2297 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2298 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
2299 vec0 = vec0 >> 3;
2300 vec1 = (vec1 >> 2) << 5;
2301 vec2 = (vec2 >> 3) << 11;
2302 dst0 = (v16u8)(vec0 | vec1 | vec2);
2303 ST_UB(dst0, dst_rgb565);
2304 src_y += 8;
2305 src_uv += 8;
2306 dst_rgb565 += 16;
2307 }
2308 }
2309
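// NV21 (Y plane plus interleaved VU plane) to ARGB: the VU bytes are swapped
// to UV before the shared YUVTORGB step. 8 pixels per loop.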
2310 void NV21ToARGBRow_MSA(const uint8_t* src_y,
2311 const uint8_t* src_vu,
2312 uint8_t* dst_argb,
2313 const struct YuvConstants* yuvconstants,
2314 int width) {
2315 int x;
2316 uint64_t val0, val1;
2317 v16u8 src0, src1, res0, res1, dst0, dst1;
2318 v8i16 vec0, vec1, vec2;
2319 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2320 v4i32 vec_ubvr, vec_ugvg;
2321 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2322 v16u8 zero = {0};
2323 v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
2324 v8i16 const_0x80 = __msa_ldi_h(0x80);
2325
2326 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2327 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2328 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2329
2330 for (x = 0; x < width; x += 8) {
2331 val0 = LD(src_y);
2332 val1 = LD(src_vu);
2333 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2334 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2335 src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
2336 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
2337 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2338 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
2339 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2340 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2341 ST_UB2(dst0, dst1, dst_argb, 16);
2342 src_y += 8;
2343 src_vu += 8;
2344 dst_argb += 32;
2345 }
2346 }
2347
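// Sobel: saturating add of the X and Y gradients, replicated into B, G and R
// with opaque alpha. 16 pixels per loop.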
2348 void SobelRow_MSA(const uint8_t* src_sobelx,
2349 const uint8_t* src_sobely,
2350 uint8_t* dst_argb,
2351 int width) {
2352 int x;
2353 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
2354 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
2355 v16i8 const_0x4 = __msa_ldi_b(0x4);
2356 v16i8 mask1 = mask0 + const_0x4;
2357 v16i8 mask2 = mask1 + const_0x4;
2358 v16i8 mask3 = mask2 + const_0x4;
2359 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2360
2361 for (x = 0; x < width; x += 16) {
2362 src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
2363 src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
2364 vec0 = __msa_adds_u_b(src0, src1);
2365 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
2366 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
2367 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
2368 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
2369 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2370 src_sobelx += 16;
2371 src_sobely += 16;
2372 dst_argb += 64;
2373 }
2374 }
2375
2376 void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
2377 const uint8_t* src_sobely,
2378 uint8_t* dst_y,
2379 int width) {
2380 int x;
2381 v16u8 src0, src1, src2, src3, dst0, dst1;
2382
2383 for (x = 0; x < width; x += 32) {
2384 src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
2385 src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16);
2386 src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
2387 src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16);
2388 dst0 = __msa_adds_u_b(src0, src2);
2389 dst1 = __msa_adds_u_b(src1, src3);
2390 ST_UB2(dst0, dst1, dst_y, 16);
2391 src_sobelx += 32;
2392 src_sobely += 32;
2393 dst_y += 32;
2394 }
2395 }
2396
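// SobelXY: pack B = Sobel Y, G = combined magnitude, R = Sobel X, A = 255.
// 16 pixels per loop.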
2397 void SobelXYRow_MSA(const uint8_t* src_sobelx,
2398 const uint8_t* src_sobely,
2399 uint8_t* dst_argb,
2400 int width) {
2401 int x;
2402 v16u8 src0, src1, vec0, vec1, vec2;
2403 v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
2404 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2405
2406 for (x = 0; x < width; x += 16) {
2407 src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
2408 src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
2409 vec0 = __msa_adds_u_b(src0, src1);
2410 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
2411 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
2412 reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
2413 reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
2414 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
2415 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
2416 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
2417 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
2418 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2419 src_sobelx += 16;
2420 src_sobely += 16;
2421 dst_argb += 64;
2422 }
2423 }
2424
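// ARGB to full range (JPeg) luma: Y = (29B + 150G + 77R + 0x80) >> 8.
// 16 pixels per loop.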
2425 void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2426 int x;
2427 v16u8 src0, src1, src2, src3, dst0;
2428 v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
2429 v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
2430 v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
2431
2432 for (x = 0; x < width; x += 16) {
2433 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
2434 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
2435 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
2436 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
2437 ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
2438 dst0);
2439 ST_UB(dst0, dst_y);
2440 src_argb += 64;
2441 dst_y += 16;
2442 }
2443 }
2444
2445 void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2446 int x;
2447 v16u8 src0, src1, src2, src3, dst0;
2448 v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
2449 v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
2450 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2451
2452 for (x = 0; x < width; x += 16) {
2453 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
2454 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
2455 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
2456 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
2457 ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
2458 dst0);
2459 ST_UB(dst0, dst_y);
2460 src_argb += 64;
2461 dst_y += 16;
2462 }
2463 }
2464
2465 void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2466 int x;
2467 v16u8 src0, src1, src2, src3, dst0;
2468 v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
2469 v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
2470 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2471
2472 for (x = 0; x < width; x += 16) {
2473 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
2474 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
2475 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
2476 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
2477 ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
2478 dst0);
2479 ST_UB(dst0, dst_y);
2480 src_argb += 64;
2481 dst_y += 16;
2482 }
2483 }
2484
2485 void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2486 int x;
2487 v16u8 src0, src1, src2, src3, dst0;
2488 v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
2489 v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
2490 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2491
2492 for (x = 0; x < width; x += 16) {
2493 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
2494 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
2495 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
2496 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
2497 ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
2498 dst0);
2499 ST_UB(dst0, dst_y);
2500 src_argb += 64;
2501 dst_y += 16;
2502 }
2503 }
2504
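// ARGB to full range (JPeg) U and V: two rows are averaged over 2x2 blocks and
// converted. 32 pixels per loop, producing 16 U and 16 V bytes.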
2505 void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
2506 int src_stride_rgb,
2507 uint8_t* dst_u,
2508 uint8_t* dst_v,
2509 int width) {
2510 int x;
2511 const uint8_t* s = src_rgb;
2512 const uint8_t* t = src_rgb + src_stride_rgb;
2513 v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
2514 v8u16 vec0, vec1, vec2, vec3;
2515 v8u16 dst0, dst1, dst2, dst3;
2516 v16u8 zero = {0};
2517 v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
2518 v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
2519 v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
2520 v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
2521 v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
2522 v4u32 const_0x00008080 = (v4u32)__msa_fill_w(0x00008080);
2523 v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
2524 v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
2525 v4i32 shift = __msa_fill_w(0x00000008);
2526
2527 for (x = 0; x < width; x += 32) {
2528 src1 = __msa_ld_b((void*)s, 0);
2529 src3 = __msa_ld_b((void*)s, 16);
2530 src5 = __msa_ld_b((void*)t, 0);
2531 src7 = __msa_ld_b((void*)t, 16);
2532 src0 = __msa_ilvr_b(zero, src1);
2533 src1 = __msa_ilvl_b(zero, src1);
2534 src2 = __msa_ilvr_b(zero, src3);
2535 src3 = __msa_ilvl_b(zero, src3);
2536 src4 = __msa_ilvr_b(zero, src5);
2537 src5 = __msa_ilvl_b(zero, src5);
2538 src6 = __msa_ilvr_b(zero, src7);
2539 src7 = __msa_ilvl_b(zero, src7);
2540 src0 += src4;
2541 src1 += src5;
2542 src2 += src6;
2543 src3 += src7;
2544 src4 = __msa_ilvev_d(src1, src0);
2545 src5 = __msa_ilvod_d(src1, src0);
2546 src6 = __msa_ilvev_d(src3, src2);
2547 src7 = __msa_ilvod_d(src3, src2);
2548 vec0 = __msa_aver_u_h(src4, src5);
2549 vec1 = __msa_aver_u_h(src6, src7);
2550
2551 src1 = __msa_ld_b((void*)s, 32);
2552 src3 = __msa_ld_b((void*)s, 48);
2553 src5 = __msa_ld_b((void*)t, 32);
2554 src7 = __msa_ld_b((void*)t, 48);
2555 src0 = __msa_ilvr_b(zero, src1);
2556 src1 = __msa_ilvl_b(zero, src1);
2557 src2 = __msa_ilvr_b(zero, src3);
2558 src3 = __msa_ilvl_b(zero, src3);
2559 src4 = __msa_ilvr_b(zero, src5);
2560 src5 = __msa_ilvl_b(zero, src5);
2561 src6 = __msa_ilvr_b(zero, src7);
2562 src7 = __msa_ilvl_b(zero, src7);
2563 src0 += src4;
2564 src1 += src5;
2565 src2 += src6;
2566 src3 += src7;
2567 src4 = __msa_ilvev_d(src1, src0);
2568 src5 = __msa_ilvod_d(src1, src0);
2569 src6 = __msa_ilvev_d(src3, src2);
2570 src7 = __msa_ilvod_d(src3, src2);
2571 vec2 = __msa_aver_u_h(src4, src5);
2572 vec3 = __msa_aver_u_h(src6, src7);
2573 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
2574 const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
2575 shuffler2, shuffler3, shift, dst0, dst1);
2576
2577 src1 = __msa_ld_b((void*)s, 64);
2578 src3 = __msa_ld_b((void*)s, 80);
2579 src5 = __msa_ld_b((void*)t, 64);
2580 src7 = __msa_ld_b((void*)t, 80);
2581 src0 = __msa_ilvr_b(zero, src1);
2582 src1 = __msa_ilvl_b(zero, src1);
2583 src2 = __msa_ilvr_b(zero, src3);
2584 src3 = __msa_ilvl_b(zero, src3);
2585 src4 = __msa_ilvr_b(zero, src5);
2586 src5 = __msa_ilvl_b(zero, src5);
2587 src6 = __msa_ilvr_b(zero, src7);
2588 src7 = __msa_ilvl_b(zero, src7);
2589 src0 += src4;
2590 src1 += src5;
2591 src2 += src6;
2592 src3 += src7;
2593 src4 = __msa_ilvev_d(src1, src0);
2594 src5 = __msa_ilvod_d(src1, src0);
2595 src6 = __msa_ilvev_d(src3, src2);
2596 src7 = __msa_ilvod_d(src3, src2);
2597 vec0 = __msa_aver_u_h(src4, src5);
2598 vec1 = __msa_aver_u_h(src6, src7);
2599
2600 src1 = __msa_ld_b((void*)s, 96);
2601 src3 = __msa_ld_b((void*)s, 112);
2602 src5 = __msa_ld_b((void*)t, 96);
2603 src7 = __msa_ld_b((void*)t, 112);
2604 src0 = __msa_ilvr_b(zero, src1);
2605 src1 = __msa_ilvl_b(zero, src1);
2606 src2 = __msa_ilvr_b(zero, src3);
2607 src3 = __msa_ilvl_b(zero, src3);
2608 src4 = __msa_ilvr_b(zero, src5);
2609 src5 = __msa_ilvl_b(zero, src5);
2610 src6 = __msa_ilvr_b(zero, src7);
2611 src7 = __msa_ilvl_b(zero, src7);
2612 src0 += src4;
2613 src1 += src5;
2614 src2 += src6;
2615 src3 += src7;
2616 src4 = __msa_ilvev_d(src1, src0);
2617 src5 = __msa_ilvod_d(src1, src0);
2618 src6 = __msa_ilvev_d(src3, src2);
2619 src7 = __msa_ilvod_d(src3, src2);
2620 vec2 = __msa_aver_u_h(src4, src5);
2621 vec3 = __msa_aver_u_h(src6, src7);
2622 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
2623 const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
2624 shuffler2, shuffler3, shift, dst2, dst3);
2625
2626 dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
2627 dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
2628 ST_UB(dst0, dst_u);
2629 ST_UB(dst1, dst_v);
2630 s += 128;
2631 t += 128;
2632 dst_v += 16;
2633 dst_u += 16;
2634 }
2635 }
2636
2637 void BGRAToUVRow_MSA(const uint8_t* src_rgb,
2638 int src_stride_rgb,
2639 uint8_t* dst_u,
2640 uint8_t* dst_v,
2641 int width) {
2642 int x;
2643 const uint8_t* s = src_rgb;
2644 const uint8_t* t = src_rgb + src_stride_rgb;
2645 const uint8_t unused = 0xf;
2646 v8u16 src0, src1, src2, src3;
2647 v16u8 dst0, dst1;
2648 v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
2649 v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
2650 v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
2651 v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
2652 v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
2653 v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
2654 v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
2655 v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
2656 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
2657
2658 for (x = 0; x < width; x += 16) {
2659 READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
2660 ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
2661 const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
2662 shuffler3, dst0, dst1);
2663 *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
2664 *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
2665 s += 64;
2666 t += 64;
2667 dst_u += 8;
2668 dst_v += 8;
2669 }
2670 }
2671
2672 void ABGRToUVRow_MSA(const uint8_t* src_rgb,
2673 int src_stride_rgb,
2674 uint8_t* dst_u,
2675 uint8_t* dst_v,
2676 int width) {
2677 int x;
2678 const uint8_t* s = src_rgb;
2679 const uint8_t* t = src_rgb + src_stride_rgb;
2680 const uint8_t unused = 0xf;
2681 v8u16 src0, src1, src2, src3;
2682 v16u8 dst0, dst1;
2683 v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
2684 v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
2685 v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
2686 v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
2687 v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
2688 v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
2689 v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
2690 v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
2691 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
2692
2693 for (x = 0; x < width; x += 16) {
2694 READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
2695 ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
2696 const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
2697 shuffler3, dst0, dst1);
2698 *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
2699 *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
2700 s += 64;
2701 t += 64;
2702 dst_u += 8;
2703 dst_v += 8;
2704 }
2705 }
2706
2707 void RGBAToUVRow_MSA(const uint8_t* src_rgb,
2708 int src_stride_rgb,
2709 uint8_t* dst_u,
2710 uint8_t* dst_v,
2711 int width) {
2712 int x;
2713 const uint8_t* s = src_rgb;
2714 const uint8_t* t = src_rgb + src_stride_rgb;
2715 const uint8_t unused = 0xf;
2716 v8u16 src0, src1, src2, src3;
2717 v16u8 dst0, dst1;
2718 v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
2719 v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
2720 v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
2721 v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
2722 v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
2723 v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
2724 v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
2725 v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
2726 v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
2727
2728 for (x = 0; x < width; x += 16) {
2729 READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
2730 ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
2731 const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
2732 shuffler3, dst0, dst1);
2733 *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
2734 *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
2735 s += 64;
2736 t += 64;
2737 dst_u += 8;
2738 dst_v += 8;
2739 }
2740 }
2741
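// I444 (one U and V sample per pixel) to ARGB. 8 pixels per loop.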
2742 void I444ToARGBRow_MSA(const uint8_t* src_y,
2743 const uint8_t* src_u,
2744 const uint8_t* src_v,
2745 uint8_t* dst_argb,
2746 const struct YuvConstants* yuvconstants,
2747 int width) {
2748 int x;
2749 v16u8 src0, src1, src2, dst0, dst1;
2750 v8i16 vec0, vec1, vec2;
2751 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
2752 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2753 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2754 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2755 v8i16 zero = {0};
2756 v4i32 const_0x80 = __msa_fill_w(0x80);
2757
2758 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2759
2760 for (x = 0; x < width; x += 8) {
2761 READI444(src_y, src_u, src_v, src0, src1, src2);
2762 vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2763 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2764 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2765 reg0 *= vec_yg;
2766 reg1 *= vec_yg;
2767 reg0 = __msa_srai_w(reg0, 16);
2768 reg1 = __msa_srai_w(reg1, 16);
2769 reg0 += vec_yb;
2770 reg1 += vec_yb;
2771 vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
2772 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
2773 reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2774 reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2775 reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
2776 reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
2777 reg6 -= const_0x80;
2778 reg7 -= const_0x80;
2779 reg8 -= const_0x80;
2780 reg9 -= const_0x80;
2781 tmp0 = reg0 + reg6 * vec_ub;
2782 tmp1 = reg1 + reg7 * vec_ub;
2783 tmp2 = reg0 + reg8 * vec_vr;
2784 tmp3 = reg1 + reg9 * vec_vr;
2785 tmp4 = reg6 * vec_ug;
2786 tmp5 = reg7 * vec_ug;
2787 tmp4 += reg8 * vec_vg;
2788 tmp5 += reg9 * vec_vg;
2789 tmp4 = reg0 - tmp4;
2790 tmp5 = reg1 - tmp5;
2791 reg0 = __msa_srai_w(tmp0, 6);
2792 reg1 = __msa_srai_w(tmp1, 6);
2793 reg2 = __msa_srai_w(tmp2, 6);
2794 reg3 = __msa_srai_w(tmp3, 6);
2795 reg4 = __msa_srai_w(tmp4, 6);
2796 reg5 = __msa_srai_w(tmp5, 6);
2797 CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
2798 vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2799 vec1 = (v8i16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
2800 vec2 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2801 vec0 = (v8i16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
2802 vec1 = (v8i16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
2803 dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
2804 dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
2805 ST_UB2(dst0, dst1, dst_argb, 16);
2806 src_y += 8;
2807 src_u += 8;
2808 src_v += 8;
2809 dst_argb += 32;
2810 }
2811 }
2812
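// Convert a grayscale Y plane to ARGB using the Y coefficient and bias from yuvconstants, 16 pixels per loop.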
2813 // TODO - respect YuvConstants
2814 void I400ToARGBRow_MSA(const uint8_t* src_y,
2815 uint8_t* dst_argb,
2816 const struct YuvConstants* yuvconstants,
2817 int width) {
2818 int x;
2819 #if defined(__aarch64__) || defined(__arm__)
2820 int ygb = yuvconstants->kUVBiasBGR[3];
2821 int yg = yuvconstants->kYToRgb[1];
2822 #else
2823 int ygb = yuvconstants->kYBiasToRgb[0];
2824 int yg = yuvconstants->kYToRgb[0];
2825 #endif
2826 v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
2827 v8i16 vec0, vec1;
2828 v4i32 reg0, reg1, reg2, reg3;
2829 v4i32 vec_yg = __msa_fill_w(yg);
2830 v8i16 vec_ygb = __msa_fill_h(ygb);
2831 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2832 v8i16 max = __msa_ldi_h(0xFF);
2833 v8i16 zero = {0};
2834
2835 for (x = 0; x < width; x += 16) {
2836 src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
2837 vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2838 vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
2839 reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
2840 reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
2841 reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
2842 reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
2843 reg0 *= vec_yg;
2844 reg1 *= vec_yg;
2845 reg2 *= vec_yg;
2846 reg3 *= vec_yg;
2847 reg0 = __msa_srai_w(reg0, 16);
2848 reg1 = __msa_srai_w(reg1, 16);
2849 reg2 = __msa_srai_w(reg2, 16);
2850 reg3 = __msa_srai_w(reg3, 16);
2851 vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2852 vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2853 vec0 += vec_ygb;
2854 vec1 += vec_ygb;
2855 vec0 = __msa_srai_h(vec0, 6);
2856 vec1 = __msa_srai_h(vec1, 6);
2857 vec0 = __msa_maxi_s_h(vec0, 0);
2858 vec1 = __msa_maxi_s_h(vec1, 0);
2859 vec0 = __msa_min_s_h(max, vec0);
2860 vec1 = __msa_min_s_h(max, vec1);
2861 res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
2862 res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
2863 res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
2864 res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
2865 res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
2866 dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
2867 dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
2868 dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
2869 dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
2870 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2871 src_y += 16;
2872 dst_argb += 64;
2873 }
2874 }
2875
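// Copy full-range Y into B, G and R with opaque alpha, 16 pixels per loop.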
2876 void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
2877 int x;
2878 v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
2879 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2880
2881 for (x = 0; x < width; x += 16) {
2882 src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
2883 vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2884 vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
2885 vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
2886 vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
2887 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
2888 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
2889 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
2890 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
2891 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2892 src_y += 16;
2893 dst_argb += 64;
2894 }
2895 }
2896
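// Convert packed YUY2 (Y0 U Y1 V) to ARGB: even bytes are Y, odd bytes are interleaved UV; 8 pixels per loop.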
2897 void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
2898 uint8_t* dst_argb,
2899 const struct YuvConstants* yuvconstants,
2900 int width) {
2901 int x;
2902 v16u8 src0, src1, src2;
2903 v8i16 vec0, vec1, vec2;
2904 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2905 v4i32 vec_ubvr, vec_ugvg;
2906 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2907 v8i16 const_0x80 = __msa_ldi_h(0x80);
2908
2909 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2910 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2911 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2912
2913 for (x = 0; x < width; x += 8) {
2914 src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
2915 src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
2916 src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
2917 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
2918 STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
2919 src_yuy2 += 16;
2920 dst_argb += 32;
2921 }
2922 }
2923
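// Convert packed UYVY (U Y0 V Y1) to ARGB: odd bytes are Y, even bytes are interleaved UV; 8 pixels per loop.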
2924 void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
2925 uint8_t* dst_argb,
2926 const struct YuvConstants* yuvconstants,
2927 int width) {
2928 int x;
2929 v16u8 src0, src1, src2;
2930 v8i16 vec0, vec1, vec2;
2931 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
2932 v4i32 vec_ubvr, vec_ugvg;
2933 v8i16 const_0x80 = __msa_ldi_h(0x80);
2934 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2935
2936 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
2937 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2938 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2939
2940 for (x = 0; x < width; x += 8) {
2941 src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
2942 src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
2943 src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
2944 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
2945 STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
2946 src_uyvy += 16;
2947 dst_argb += 32;
2948 }
2949 }
2950
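// Blend two rows vertically: dst = (src * (256 - fraction) + src_below * fraction + 128) >> 8.
// Fast paths: fraction 0 is a plain copy, fraction 128 is an average; otherwise 32 bytes per loop.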
2951 void InterpolateRow_MSA(uint8_t* dst_ptr,
2952 const uint8_t* src_ptr,
2953 ptrdiff_t src_stride,
2954 int width,
2955 int32_t source_y_fraction) {
2956 int32_t y1_fraction = source_y_fraction;
2957 int32_t y0_fraction = 256 - y1_fraction;
2958 uint16_t y_fractions;
2959 const uint8_t* s = src_ptr;
2960 const uint8_t* t = src_ptr + src_stride;
2961 int x;
2962 v16u8 src0, src1, src2, src3, dst0, dst1;
2963 v8u16 vec0, vec1, vec2, vec3, y_frac;
2964
2965 if (0 == y1_fraction) {
2966 memcpy(dst_ptr, src_ptr, width);
2967 return;
2968 }
2969
2970 if (128 == y1_fraction) {
2971 for (x = 0; x < width; x += 32) {
2972 src0 = (v16u8)__msa_ld_b((void*)s, 0);
2973 src1 = (v16u8)__msa_ld_b((void*)s, 16);
2974 src2 = (v16u8)__msa_ld_b((void*)t, 0);
2975 src3 = (v16u8)__msa_ld_b((void*)t, 16);
2976 dst0 = __msa_aver_u_b(src0, src2);
2977 dst1 = __msa_aver_u_b(src1, src3);
2978 ST_UB2(dst0, dst1, dst_ptr, 16);
2979 s += 32;
2980 t += 32;
2981 dst_ptr += 32;
2982 }
2983 return;
2984 }
2985
2986 y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
2987 y_frac = (v8u16)__msa_fill_h(y_fractions);
2988
2989 for (x = 0; x < width; x += 32) {
2990 src0 = (v16u8)__msa_ld_b((void*)s, 0);
2991 src1 = (v16u8)__msa_ld_b((void*)s, 16);
2992 src2 = (v16u8)__msa_ld_b((void*)t, 0);
2993 src3 = (v16u8)__msa_ld_b((void*)t, 16);
2994 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
2995 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
2996 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
2997 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
2998 vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
2999 vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
3000 vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
3001 vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
3002 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
3003 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
3004 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
3005 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
3006 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
3007 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
3008 ST_UB2(dst0, dst1, dst_ptr, 16);
3009 s += 32;
3010 t += 32;
3011 dst_ptr += 32;
3012 }
3013 }
3014
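// Fill a row with the 32-bit ARGB value v32, 4 pixels (16 bytes) per loop.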
3015 void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
3016 int x;
3017 v4i32 dst0 = __builtin_msa_fill_w(v32);
3018
3019 for (x = 0; x < width; x += 4) {
3020 ST_UB(dst0, dst_argb);
3021 dst_argb += 16;
3022 }
3023 }
3024
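// Convert RAW to RGB24 by reversing the byte order of each 3-byte pixel, 16 pixels per loop.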
3025 void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
3026 int x;
3027 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
3028 v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
3029 v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
3030 18, 17, 16, 21, 20, 19, 24, 23};
3031 v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
3032 24, 23, 28, 27, 26, 31, 30, 29};
3033
3034 for (x = 0; x < width; x += 16) {
3035 src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
3036 src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
3037 src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
3038 src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
3039 src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
3040 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
3041 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
3042 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
3043 ST_UB2(dst0, dst1, dst_rgb24, 16);
3044 ST_UB(dst2, (dst_rgb24 + 32));
3045 src_raw += 48;
3046 dst_rgb24 += 48;
3047 }
3048 }
3049
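// Interleave planar U and V into a packed UV plane, 16 pairs per loop.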
3050 void MergeUVRow_MSA(const uint8_t* src_u,
3051 const uint8_t* src_v,
3052 uint8_t* dst_uv,
3053 int width) {
3054 int x;
3055 v16u8 src0, src1, dst0, dst1;
3056
3057 for (x = 0; x < width; x += 16) {
3058 src0 = (v16u8)__msa_ld_b((void*)src_u, 0);
3059 src1 = (v16u8)__msa_ld_b((void*)src_v, 0);
3060 dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
3061 dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
3062 ST_UB2(dst0, dst1, dst_uv, 16);
3063 src_u += 16;
3064 src_v += 16;
3065 dst_uv += 32;
3066 }
3067 }
3068
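// Copy the alpha byte of each ARGB pixel to dst_a, 16 pixels per loop.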
3069 void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
3070 uint8_t* dst_a,
3071 int width) {
3072 int i;
3073 v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
3074
3075 for (i = 0; i < width; i += 16) {
3076 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
3077 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
3078 src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
3079 src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
3080 vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
3081 vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
3082 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
3083 ST_UB(dst0, dst_a);
3084 src_argb += 64;
3085 dst_a += 16;
3086 }
3087 }
3088
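// Alpha blend: per channel, dst = src_argb + ((256 - alpha(src_argb)) * src_argb1 >> 8),
// saturated, with destination alpha forced to 255; 8 pixels per loop.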
3089 void ARGBBlendRow_MSA(const uint8_t* src_argb,
3090 const uint8_t* src_argb1,
3091 uint8_t* dst_argb,
3092 int width) {
3093 int x;
3094 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
3095 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3096 v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
3097 v8u16 const_256 = (v8u16)__msa_ldi_h(256);
3098 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
3099 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
3100 v16i8 zero = {0};
3101
3102 for (x = 0; x < width; x += 8) {
3103 src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
3104 src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
3105 src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
3106 src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
3107 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
3108 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
3109 vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
3110 vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
3111 vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
3112 vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
3113 vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
3114 vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
3115 vec8 = (v8u16)__msa_fill_h(vec0[3]);
3116 vec9 = (v8u16)__msa_fill_h(vec0[7]);
3117 vec10 = (v8u16)__msa_fill_h(vec1[3]);
3118 vec11 = (v8u16)__msa_fill_h(vec1[7]);
3119 vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
3120 vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
3121 vec10 = (v8u16)__msa_fill_h(vec2[3]);
3122 vec11 = (v8u16)__msa_fill_h(vec2[7]);
3123 vec12 = (v8u16)__msa_fill_h(vec3[3]);
3124 vec13 = (v8u16)__msa_fill_h(vec3[7]);
3125 vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
3126 vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
3127 vec8 = const_256 - vec8;
3128 vec9 = const_256 - vec9;
3129 vec10 = const_256 - vec10;
3130 vec11 = const_256 - vec11;
3131 vec8 *= vec4;
3132 vec9 *= vec5;
3133 vec10 *= vec6;
3134 vec11 *= vec7;
3135 vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
3136 vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
3137 vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
3138 vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
3139 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
3140 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
3141 dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
3142 dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
3143 dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
3144 dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
3145 dst0 = __msa_bmnz_v(dst0, const_255, mask);
3146 dst1 = __msa_bmnz_v(dst1, const_255, mask);
3147 ST_UB2(dst0, dst1, dst_argb, 16);
3148 src_argb += 32;
3149 src_argb1 += 32;
3150 dst_argb += 32;
3151 }
3152 }
3153
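// Quantize B, G and R in place: c = ((c * scale) >> 16) * interval_size + interval_offset,
// leaving alpha untouched; 16 pixels per loop.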
3154 void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
3155 int scale,
3156 int interval_size,
3157 int interval_offset,
3158 int width) {
3159 int x;
3160 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
3161 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3162 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3163 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3164 v4i32 vec_scale = __msa_fill_w(scale);
3165 v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
3166 v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
3167 v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
3168 v16i8 zero = {0};
3169
3170 for (x = 0; x < width; x += 16) {
3171 src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
3172 src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
3173 src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
3174 src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48);
3175 vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
3176 vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
3177 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
3178 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
3179 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
3180 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
3181 vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
3182 vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
3183 tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
3184 tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
3185 tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
3186 tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
3187 tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
3188 tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
3189 tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
3190 tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
3191 tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
3192 tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
3193 tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
3194 tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
3195 tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
3196 tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
3197 tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
3198 tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
3199 tmp0 *= vec_scale;
3200 tmp1 *= vec_scale;
3201 tmp2 *= vec_scale;
3202 tmp3 *= vec_scale;
3203 tmp4 *= vec_scale;
3204 tmp5 *= vec_scale;
3205 tmp6 *= vec_scale;
3206 tmp7 *= vec_scale;
3207 tmp8 *= vec_scale;
3208 tmp9 *= vec_scale;
3209 tmp10 *= vec_scale;
3210 tmp11 *= vec_scale;
3211 tmp12 *= vec_scale;
3212 tmp13 *= vec_scale;
3213 tmp14 *= vec_scale;
3214 tmp15 *= vec_scale;
3215 tmp0 >>= 16;
3216 tmp1 >>= 16;
3217 tmp2 >>= 16;
3218 tmp3 >>= 16;
3219 tmp4 >>= 16;
3220 tmp5 >>= 16;
3221 tmp6 >>= 16;
3222 tmp7 >>= 16;
3223 tmp8 >>= 16;
3224 tmp9 >>= 16;
3225 tmp10 >>= 16;
3226 tmp11 >>= 16;
3227 tmp12 >>= 16;
3228 tmp13 >>= 16;
3229 tmp14 >>= 16;
3230 tmp15 >>= 16;
3231 vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
3232 vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
3233 vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
3234 vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
3235 vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
3236 vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
3237 vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
3238 vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
3239 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
3240 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
3241 dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
3242 dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
3243 dst0 *= vec_int_sz;
3244 dst1 *= vec_int_sz;
3245 dst2 *= vec_int_sz;
3246 dst3 *= vec_int_sz;
3247 dst0 += vec_int_ofst;
3248 dst1 += vec_int_ofst;
3249 dst2 += vec_int_ofst;
3250 dst3 += vec_int_ofst;
3251 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
3252 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
3253 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
3254 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
3255 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
3256 dst_argb += 64;
3257 }
3258 }
3259
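// Multiply each ARGB pixel by the 4x4 signed byte matrix (6-bit fraction) and clamp to 0..255, 8 pixels per loop.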
3260 void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
3261 uint8_t* dst_argb,
3262 const int8_t* matrix_argb,
3263 int width) {
3264 int32_t x;
3265 v16i8 src0;
3266 v16u8 src1, src2, dst0, dst1;
3267 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3268 v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3269 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3270 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3271 v16i8 zero = {0};
3272 v8i16 max = __msa_ldi_h(255);
3273
3274 src0 = __msa_ld_b((void*)matrix_argb, 0);
3275 vec0 = (v8i16)__msa_ilvr_b(zero, src0);
3276 vec1 = (v8i16)__msa_ilvl_b(zero, src0);
3277
3278 for (x = 0; x < width; x += 8) {
3279 src1 = (v16u8)__msa_ld_b((void*)src_argb, 0);
3280 src2 = (v16u8)__msa_ld_b((void*)src_argb, 16);
3281 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
3282 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
3283 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
3284 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
3285 vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
3286 vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
3287 vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
3288 vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
3289 vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
3290 vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
3291 vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
3292 vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
3293 vec10 = vec2 * vec0;
3294 vec11 = vec2 * vec1;
3295 vec12 = vec6 * vec0;
3296 vec13 = vec6 * vec1;
3297 tmp0 = __msa_hadd_s_w(vec10, vec10);
3298 tmp1 = __msa_hadd_s_w(vec11, vec11);
3299 tmp2 = __msa_hadd_s_w(vec12, vec12);
3300 tmp3 = __msa_hadd_s_w(vec13, vec13);
3301 vec14 = vec3 * vec0;
3302 vec15 = vec3 * vec1;
3303 vec16 = vec7 * vec0;
3304 vec17 = vec7 * vec1;
3305 tmp4 = __msa_hadd_s_w(vec14, vec14);
3306 tmp5 = __msa_hadd_s_w(vec15, vec15);
3307 tmp6 = __msa_hadd_s_w(vec16, vec16);
3308 tmp7 = __msa_hadd_s_w(vec17, vec17);
3309 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
3310 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
3311 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
3312 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
3313 tmp0 = __msa_hadd_s_w(vec10, vec10);
3314 tmp1 = __msa_hadd_s_w(vec11, vec11);
3315 tmp2 = __msa_hadd_s_w(vec12, vec12);
3316 tmp3 = __msa_hadd_s_w(vec13, vec13);
3317 tmp0 = __msa_srai_w(tmp0, 6);
3318 tmp1 = __msa_srai_w(tmp1, 6);
3319 tmp2 = __msa_srai_w(tmp2, 6);
3320 tmp3 = __msa_srai_w(tmp3, 6);
3321 vec2 = vec4 * vec0;
3322 vec6 = vec4 * vec1;
3323 vec3 = vec8 * vec0;
3324 vec7 = vec8 * vec1;
3325 tmp8 = __msa_hadd_s_w(vec2, vec2);
3326 tmp9 = __msa_hadd_s_w(vec6, vec6);
3327 tmp10 = __msa_hadd_s_w(vec3, vec3);
3328 tmp11 = __msa_hadd_s_w(vec7, vec7);
3329 vec4 = vec5 * vec0;
3330 vec8 = vec5 * vec1;
3331 vec5 = vec9 * vec0;
3332 vec9 = vec9 * vec1;
3333 tmp12 = __msa_hadd_s_w(vec4, vec4);
3334 tmp13 = __msa_hadd_s_w(vec8, vec8);
3335 tmp14 = __msa_hadd_s_w(vec5, vec5);
3336 tmp15 = __msa_hadd_s_w(vec9, vec9);
3337 vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
3338 vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
3339 vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
3340 vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
3341 tmp4 = __msa_hadd_s_w(vec14, vec14);
3342 tmp5 = __msa_hadd_s_w(vec15, vec15);
3343 tmp6 = __msa_hadd_s_w(vec16, vec16);
3344 tmp7 = __msa_hadd_s_w(vec17, vec17);
3345 tmp4 = __msa_srai_w(tmp4, 6);
3346 tmp5 = __msa_srai_w(tmp5, 6);
3347 tmp6 = __msa_srai_w(tmp6, 6);
3348 tmp7 = __msa_srai_w(tmp7, 6);
3349 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
3350 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
3351 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
3352 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
3353 vec10 = __msa_maxi_s_h(vec10, 0);
3354 vec11 = __msa_maxi_s_h(vec11, 0);
3355 vec12 = __msa_maxi_s_h(vec12, 0);
3356 vec13 = __msa_maxi_s_h(vec13, 0);
3357 vec10 = __msa_min_s_h(vec10, max);
3358 vec11 = __msa_min_s_h(vec11, max);
3359 vec12 = __msa_min_s_h(vec12, max);
3360 vec13 = __msa_min_s_h(vec13, max);
3361 dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
3362 dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
3363 ST_UB2(dst0, dst1, dst_argb, 16);
3364 src_argb += 32;
3365 dst_argb += 32;
3366 }
3367 }
3368
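// De-interleave a packed UV plane into separate U and V planes, 32 pairs per loop.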
3369 void SplitUVRow_MSA(const uint8_t* src_uv,
3370 uint8_t* dst_u,
3371 uint8_t* dst_v,
3372 int width) {
3373 int x;
3374 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
3375
3376 for (x = 0; x < width; x += 32) {
3377 src0 = (v16u8)__msa_ld_b((void*)src_uv, 0);
3378 src1 = (v16u8)__msa_ld_b((void*)src_uv, 16);
3379 src2 = (v16u8)__msa_ld_b((void*)src_uv, 32);
3380 src3 = (v16u8)__msa_ld_b((void*)src_uv, 48);
3381 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
3382 dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
3383 dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
3384 dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
3385 ST_UB2(dst0, dst1, dst_u, 16);
3386 ST_UB2(dst2, dst3, dst_v, 16);
3387 src_uv += 64;
3388 dst_u += 32;
3389 dst_v += 32;
3390 }
3391 }
3392
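// Fill a row of bytes with v8, 16 bytes per loop.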
3393 void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
3394 int x;
3395 v16u8 dst0 = (v16u8)__msa_fill_b(v8);
3396
3397 for (x = 0; x < width; x += 16) {
3398 ST_UB(dst0, dst);
3399 dst += 16;
3400 }
3401 }
3402
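// Split packed UV into U and V planes while mirroring horizontally, 32 pairs per loop.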
3403 void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
3404 uint8_t* dst_u,
3405 uint8_t* dst_v,
3406 int width) {
3407 int x;
3408 v16u8 src0, src1, src2, src3;
3409 v16u8 dst0, dst1, dst2, dst3;
3410 v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
3411 v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
3412
3413 src_uv += (2 * width);
3414
3415 for (x = 0; x < width; x += 32) {
3416 src_uv -= 64;
3417 src2 = (v16u8)__msa_ld_b((void*)src_uv, 0);
3418 src3 = (v16u8)__msa_ld_b((void*)src_uv, 16);
3419 src0 = (v16u8)__msa_ld_b((void*)src_uv, 32);
3420 src1 = (v16u8)__msa_ld_b((void*)src_uv, 48);
3421 dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
3422 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
3423 dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
3424 dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
3425 ST_UB2(dst0, dst1, dst_v, 16);
3426 ST_UB2(dst2, dst3, dst_u, 16);
3427 dst_u += 32;
3428 dst_v += 32;
3429 }
3430 }
3431
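// SobelX: sum the 2-pixel-apart horizontal differences of three rows with 1:2:1 weights,
// take the absolute value and clamp to 0..255; 16 pixels per loop.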
3432 void SobelXRow_MSA(const uint8_t* src_y0,
3433 const uint8_t* src_y1,
3434 const uint8_t* src_y2,
3435 uint8_t* dst_sobelx,
3436 int32_t width) {
3437 int x;
3438 v16u8 src0, src1, src2, src3, src4, src5, dst0;
3439 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
3440 v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
3441 v16i8 tmp = __msa_ldi_b(8);
3442 v16i8 mask1 = mask0 + tmp;
3443 v8i16 zero = {0};
3444 v8i16 max = __msa_ldi_h(255);
3445
3446 for (x = 0; x < width; x += 16) {
3447 src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
3448 src1 = (v16u8)__msa_ld_b((void*)src_y0, 16);
3449 src2 = (v16u8)__msa_ld_b((void*)src_y1, 0);
3450 src3 = (v16u8)__msa_ld_b((void*)src_y1, 16);
3451 src4 = (v16u8)__msa_ld_b((void*)src_y2, 0);
3452 src5 = (v16u8)__msa_ld_b((void*)src_y2, 16);
3453 vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
3454 vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
3455 vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
3456 vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
3457 vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
3458 vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
3459 vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
3460 vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
3461 vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
3462 vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
3463 vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
3464 vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
3465 vec0 += vec2;
3466 vec1 += vec3;
3467 vec4 += vec2;
3468 vec5 += vec3;
3469 vec0 += vec4;
3470 vec1 += vec5;
3471 vec0 = __msa_add_a_h(zero, vec0);
3472 vec1 = __msa_add_a_h(zero, vec1);
3473 vec0 = __msa_maxi_s_h(vec0, 0);
3474 vec1 = __msa_maxi_s_h(vec1, 0);
3475 vec0 = __msa_min_s_h(max, vec0);
3476 vec1 = __msa_min_s_h(max, vec1);
3477 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
3478 ST_UB(dst0, dst_sobelx);
3479 src_y0 += 16;
3480 src_y1 += 16;
3481 src_y2 += 16;
3482 dst_sobelx += 16;
3483 }
3484 }
3485
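// SobelY: sum the row0 - row1 differences of three adjacent columns with 1:2:1 weights,
// take the absolute value and clamp to 0..255; 16 pixels per loop.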
3486 void SobelYRow_MSA(const uint8_t* src_y0,
3487 const uint8_t* src_y1,
3488 uint8_t* dst_sobely,
3489 int32_t width) {
3490 int x;
3491 v16u8 src0, src1, dst0;
3492 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
3493 v8i16 zero = {0};
3494 v8i16 max = __msa_ldi_h(255);
3495
3496 for (x = 0; x < width; x += 16) {
3497 src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
3498 src1 = (v16u8)__msa_ld_b((void*)src_y1, 0);
3499 vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
3500 vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
3501 vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
3502 vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
3503 vec0 -= vec2;
3504 vec1 -= vec3;
3505 vec6[0] = src_y0[16] - src_y1[16];
3506 vec6[1] = src_y0[17] - src_y1[17];
3507 vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
3508 vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
3509 vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
3510 vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
3511 vec0 += vec2;
3512 vec1 += vec3;
3513 vec4 += vec2;
3514 vec5 += vec3;
3515 vec0 += vec4;
3516 vec1 += vec5;
3517 vec0 = __msa_add_a_h(zero, vec0);
3518 vec1 = __msa_add_a_h(zero, vec1);
3519 vec0 = __msa_maxi_s_h(vec0, 0);
3520 vec1 = __msa_maxi_s_h(vec1, 0);
3521 vec0 = __msa_min_s_h(max, vec0);
3522 vec1 = __msa_min_s_h(max, vec1);
3523 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
3524 ST_UB(dst0, dst_sobely);
3525 src_y0 += 16;
3526 src_y1 += 16;
3527 dst_sobely += 16;
3528 }
3529 }
3530
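// Convert 16-bit values scaled by 'scale' to IEEE half floats: multiply by scale * 2^-112
// as float, then take the float bit pattern >> 13 to form the half; 32 values per loop.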
3531 void HalfFloatRow_MSA(const uint16_t* src,
3532 uint16_t* dst,
3533 float scale,
3534 int width) {
3535 int i;
3536 v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
3537 v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3538 v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
3539 v4f32 mult_vec;
3540 v8i16 zero = {0};
3541 mult_vec[0] = 1.9259299444e-34f * scale;
3542 mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
3543
3544 for (i = 0; i < width; i += 32) {
3545 src0 = (v8u16)__msa_ld_h((void*)src, 0);
3546 src1 = (v8u16)__msa_ld_h((void*)src, 16);
3547 src2 = (v8u16)__msa_ld_h((void*)src, 32);
3548 src3 = (v8u16)__msa_ld_h((void*)src, 48);
3549 vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
3550 vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
3551 vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
3552 vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
3553 vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
3554 vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
3555 vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
3556 vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
3557 fvec0 = __msa_ffint_u_w(vec0);
3558 fvec1 = __msa_ffint_u_w(vec1);
3559 fvec2 = __msa_ffint_u_w(vec2);
3560 fvec3 = __msa_ffint_u_w(vec3);
3561 fvec4 = __msa_ffint_u_w(vec4);
3562 fvec5 = __msa_ffint_u_w(vec5);
3563 fvec6 = __msa_ffint_u_w(vec6);
3564 fvec7 = __msa_ffint_u_w(vec7);
3565 fvec0 *= mult_vec;
3566 fvec1 *= mult_vec;
3567 fvec2 *= mult_vec;
3568 fvec3 *= mult_vec;
3569 fvec4 *= mult_vec;
3570 fvec5 *= mult_vec;
3571 fvec6 *= mult_vec;
3572 fvec7 *= mult_vec;
3573 vec0 = ((v4u32)fvec0) >> 13;
3574 vec1 = ((v4u32)fvec1) >> 13;
3575 vec2 = ((v4u32)fvec2) >> 13;
3576 vec3 = ((v4u32)fvec3) >> 13;
3577 vec4 = ((v4u32)fvec4) >> 13;
3578 vec5 = ((v4u32)fvec5) >> 13;
3579 vec6 = ((v4u32)fvec6) >> 13;
3580 vec7 = ((v4u32)fvec7) >> 13;
3581 dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
3582 dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
3583 dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
3584 dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
3585 ST_UH2(dst0, dst1, dst, 8);
3586 ST_UH2(dst2, dst3, dst + 16, 8);
3587 src += 32;
3588 dst += 32;
3589 }
3590 }
3591
3592 #ifdef __cplusplus
3593 } // extern "C"
3594 } // namespace libyuv
3595 #endif
3596
3597 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
3598