1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <string.h>
12
13 #include "libyuv/row.h"
14
15 // This module is for GCC MSA
16 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
17 #include "libyuv/macros_msa.h"
18
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23
24 #define ALPHA_VAL (-1)
25
26 // Fill YUV -> RGB conversion constants into vectors
27 #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
28 { \
29 ub = __msa_fill_w(yuvconst->kUVToB[0]); \
30 vr = __msa_fill_w(yuvconst->kUVToR[1]); \
31 ug = __msa_fill_w(yuvconst->kUVToG[0]); \
32 vg = __msa_fill_w(yuvconst->kUVToG[1]); \
33 bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
34 bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
35 br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
36 yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
37 }
38
39 // Load YUV 422 pixel data
40 #define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
41 { \
42 uint64 y_m; \
43 uint32 u_m, v_m; \
44 v4i32 zero_m = {0}; \
45 y_m = LD(psrc_y); \
46 u_m = LW(psrc_u); \
47 v_m = LW(psrc_v); \
48 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
49 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
50 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
51 }
52
53 // Clip input vector elements between 0 to 255
54 #define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
55 { \
56 v4i32 max_m = __msa_ldi_w(0xFF); \
57 \
58 in0 = __msa_maxi_s_w(in0, 0); \
59 in1 = __msa_maxi_s_w(in1, 0); \
60 in2 = __msa_maxi_s_w(in2, 0); \
61 in3 = __msa_maxi_s_w(in3, 0); \
62 in4 = __msa_maxi_s_w(in4, 0); \
63 in5 = __msa_maxi_s_w(in5, 0); \
64 in0 = __msa_min_s_w(max_m, in0); \
65 in1 = __msa_min_s_w(max_m, in1); \
66 in2 = __msa_min_s_w(max_m, in2); \
67 in3 = __msa_min_s_w(max_m, in3); \
68 in4 = __msa_min_s_w(max_m, in4); \
69 in5 = __msa_min_s_w(max_m, in5); \
70 }
71
72 // Convert 8 pixels of YUV 420 to RGB.
73 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
74 { \
75 v8i16 vec0_m, vec1_m; \
76 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
77 v4i32 reg5_m, reg6_m, reg7_m; \
78 v16i8 zero_m = {0}; \
79 \
80 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
81 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
82 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
83 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
84 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
85 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
86 reg0_m *= yg; \
87 reg1_m *= yg; \
88 reg2_m *= ubvr; \
89 reg3_m *= ubvr; \
90 reg0_m = __msa_srai_w(reg0_m, 16); \
91 reg1_m = __msa_srai_w(reg1_m, 16); \
92 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
93 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
94 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
95 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
96 reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
97 reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
98 reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
99 reg5_m = reg0_m - reg5_m; \
100 reg6_m = reg1_m - reg6_m; \
101 reg2_m = reg0_m - reg2_m; \
102 reg3_m = reg1_m - reg3_m; \
103 reg7_m = reg0_m - reg7_m; \
104 reg4_m = reg1_m - reg4_m; \
105 reg5_m += bb; \
106 reg6_m += bb; \
107 reg7_m += bg; \
108 reg4_m += bg; \
109 reg2_m += br; \
110 reg3_m += br; \
111 reg5_m = __msa_srai_w(reg5_m, 6); \
112 reg6_m = __msa_srai_w(reg6_m, 6); \
113 reg7_m = __msa_srai_w(reg7_m, 6); \
114 reg4_m = __msa_srai_w(reg4_m, 6); \
115 reg2_m = __msa_srai_w(reg2_m, 6); \
116 reg3_m = __msa_srai_w(reg3_m, 6); \
117 CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
118 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
119 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
120 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
121 }
122
123 // Pack and Store 8 ARGB values.
124 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \
125 { \
126 v8i16 vec0_m, vec1_m; \
127 v16u8 dst0_m, dst1_m; \
128 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
129 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
130 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
131 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
132 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
133 }
134
135 // Takes ARGB input and calculates Y.
136 #define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
137 y_out) \
138 { \
139 v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
140 v8u16 reg0_m, reg1_m; \
141 \
142 vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
143 vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
144 vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
145 vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
146 reg0_m = __msa_dotp_u_h(vec0_m, const0); \
147 reg1_m = __msa_dotp_u_h(vec1_m, const0); \
148 reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
149 reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
150 reg0_m += const2; \
151 reg1_m += const2; \
152 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
153 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
154 y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
155 }
156
157 // Loads current and next row of ARGB input and averages it to calculate U and V
158 #define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
159 { \
160 v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
161 v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
162 v16u8 vec8_m, vec9_m; \
163 v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
164 v8u16 reg8_m, reg9_m; \
165 \
166 src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \
167 src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \
168 src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \
169 src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \
170 src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \
171 src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \
172 src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \
173 src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \
174 vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
175 vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
176 vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
177 vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
178 vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
179 vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
180 vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
181 vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
182 reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
183 reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
184 reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
185 reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
186 reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
187 reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
188 reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
189 reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
190 reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
191 reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
192 reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
193 reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
194 reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
195 reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
196 reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
197 reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
198 reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
199 reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
200 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
201 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
202 argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
203 argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
204 src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \
205 src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \
206 src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \
207 src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \
208 src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \
209 src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \
210 src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \
211 src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \
212 vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
213 vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
214 vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
215 vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
216 vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
217 vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
218 vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
219 vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
220 reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
221 reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
222 reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
223 reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
224 reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
225 reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
226 reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
227 reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
228 reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
229 reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
230 reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
231 reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
232 reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
233 reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
234 reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
235 reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
236 reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
237 reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
238 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
239 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
240 argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
241 argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
242 }
243
244 // Takes ARGB input and calculates U and V.
245 #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
246 shf0, shf1, shf2, shf3, v_out, u_out) \
247 { \
248 v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
249 v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
250 \
251 vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
252 vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
253 vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
254 vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
255 vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
256 vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
257 vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
258 vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
259 reg0_m = __msa_dotp_u_h(vec0_m, const1); \
260 reg1_m = __msa_dotp_u_h(vec1_m, const1); \
261 reg2_m = __msa_dotp_u_h(vec4_m, const1); \
262 reg3_m = __msa_dotp_u_h(vec5_m, const1); \
263 reg0_m += const3; \
264 reg1_m += const3; \
265 reg2_m += const3; \
266 reg3_m += const3; \
267 reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
268 reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
269 reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
270 reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
271 v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
272 u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
273 }
274
275 // Load I444 pixel data
276 #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
277 { \
278 uint64 y_m, u_m, v_m; \
279 v2i64 zero_m = {0}; \
280 y_m = LD(psrc_y); \
281 u_m = LD(psrc_u); \
282 v_m = LD(psrc_v); \
283 out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \
284 out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \
285 out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \
286 }
287
MirrorRow_MSA(const uint8 * src,uint8 * dst,int width)288 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
289 int x;
290 v16u8 src0, src1, src2, src3;
291 v16u8 dst0, dst1, dst2, dst3;
292 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
293 src += width - 64;
294
295 for (x = 0; x < width; x += 64) {
296 LD_UB4(src, 16, src3, src2, src1, src0);
297 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
298 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
299 ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
300 dst += 64;
301 src -= 64;
302 }
303 }
304
ARGBMirrorRow_MSA(const uint8 * src,uint8 * dst,int width)305 void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
306 int x;
307 v16u8 src0, src1, src2, src3;
308 v16u8 dst0, dst1, dst2, dst3;
309 v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
310 src += width * 4 - 64;
311
312 for (x = 0; x < width; x += 16) {
313 LD_UB4(src, 16, src3, src2, src1, src0);
314 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
315 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
316 ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
317 dst += 64;
318 src -= 64;
319 }
320 }
321
I422ToYUY2Row_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)322 void I422ToYUY2Row_MSA(const uint8* src_y,
323 const uint8* src_u,
324 const uint8* src_v,
325 uint8* dst_yuy2,
326 int width) {
327 int x;
328 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
329 v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
330
331 for (x = 0; x < width; x += 32) {
332 src_u0 = LD_UB(src_u);
333 src_v0 = LD_UB(src_v);
334 LD_UB2(src_y, 16, src_y0, src_y1);
335 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
336 ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
337 ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
338 ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
339 src_u += 16;
340 src_v += 16;
341 src_y += 32;
342 dst_yuy2 += 64;
343 }
344 }
345
I422ToUYVYRow_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)346 void I422ToUYVYRow_MSA(const uint8* src_y,
347 const uint8* src_u,
348 const uint8* src_v,
349 uint8* dst_uyvy,
350 int width) {
351 int x;
352 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
353 v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
354
355 for (x = 0; x < width; x += 32) {
356 src_u0 = LD_UB(src_u);
357 src_v0 = LD_UB(src_v);
358 LD_UB2(src_y, 16, src_y0, src_y1);
359 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
360 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
361 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
362 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
363 src_u += 16;
364 src_v += 16;
365 src_y += 32;
366 dst_uyvy += 64;
367 }
368 }
369
I422ToARGBRow_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)370 void I422ToARGBRow_MSA(const uint8* src_y,
371 const uint8* src_u,
372 const uint8* src_v,
373 uint8* rgb_buf,
374 const struct YuvConstants* yuvconstants,
375 int width) {
376 int x;
377 v16u8 src0, src1, src2;
378 v8i16 vec0, vec1, vec2;
379 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
380 v4i32 vec_ubvr, vec_ugvg;
381 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
382
383 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
384 vec_br, vec_yg);
385 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
386 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
387
388 for (x = 0; x < width; x += 8) {
389 READYUV422(src_y, src_u, src_v, src0, src1, src2);
390 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
391 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
392 vec0, vec1, vec2);
393 STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
394 src_y += 8;
395 src_u += 4;
396 src_v += 4;
397 rgb_buf += 32;
398 }
399 }
400
I422ToRGBARow_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)401 void I422ToRGBARow_MSA(const uint8* src_y,
402 const uint8* src_u,
403 const uint8* src_v,
404 uint8* rgb_buf,
405 const struct YuvConstants* yuvconstants,
406 int width) {
407 int x;
408 v16u8 src0, src1, src2;
409 v8i16 vec0, vec1, vec2;
410 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
411 v4i32 vec_ubvr, vec_ugvg;
412 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
413
414 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
415 vec_br, vec_yg);
416 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
417 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
418
419 for (x = 0; x < width; x += 8) {
420 READYUV422(src_y, src_u, src_v, src0, src1, src2);
421 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
422 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
423 vec0, vec1, vec2);
424 STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
425 src_y += 8;
426 src_u += 4;
427 src_v += 4;
428 rgb_buf += 32;
429 }
430 }
431
I422AlphaToARGBRow_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,const uint8 * src_a,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)432 void I422AlphaToARGBRow_MSA(const uint8* src_y,
433 const uint8* src_u,
434 const uint8* src_v,
435 const uint8* src_a,
436 uint8* rgb_buf,
437 const struct YuvConstants* yuvconstants,
438 int width) {
439 int x;
440 int64 data_a;
441 v16u8 src0, src1, src2, src3;
442 v8i16 vec0, vec1, vec2;
443 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
444 v4i32 vec_ubvr, vec_ugvg;
445 v4i32 zero = {0};
446
447 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
448 vec_br, vec_yg);
449 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
450 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
451
452 for (x = 0; x < width; x += 8) {
453 data_a = LD(src_a);
454 READYUV422(src_y, src_u, src_v, src0, src1, src2);
455 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
456 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
457 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
458 vec0, vec1, vec2);
459 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
460 STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
461 src_y += 8;
462 src_u += 4;
463 src_v += 4;
464 src_a += 8;
465 rgb_buf += 32;
466 }
467 }
468
I422ToRGB24Row_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int32 width)469 void I422ToRGB24Row_MSA(const uint8* src_y,
470 const uint8* src_u,
471 const uint8* src_v,
472 uint8* rgb_buf,
473 const struct YuvConstants* yuvconstants,
474 int32 width) {
475 int x;
476 int64 data_u, data_v;
477 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
478 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
479 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
480 v4i32 vec_ubvr, vec_ugvg;
481 v16u8 reg0, reg1, reg2, reg3;
482 v2i64 zero = {0};
483 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
484 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
485 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
486 11, 29, 12, 13, 30, 14, 15, 31};
487
488 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
489 vec_br, vec_yg);
490 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
491 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
492
493 for (x = 0; x < width; x += 16) {
494 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
495 data_u = LD(src_u);
496 data_v = LD(src_v);
497 src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
498 src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
499 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
500 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
501 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
502 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
503 vec0, vec1, vec2);
504 YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
505 vec3, vec4, vec5);
506 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
507 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
508 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
509 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
510 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
511 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
512 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
513 ST_UB2(dst0, dst1, rgb_buf, 16);
514 ST_UB(dst2, (rgb_buf + 32));
515 src_y += 16;
516 src_u += 8;
517 src_v += 8;
518 rgb_buf += 48;
519 }
520 }
521
522 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
I422ToRGB565Row_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgb565,const struct YuvConstants * yuvconstants,int width)523 void I422ToRGB565Row_MSA(const uint8* src_y,
524 const uint8* src_u,
525 const uint8* src_v,
526 uint8* dst_rgb565,
527 const struct YuvConstants* yuvconstants,
528 int width) {
529 int x;
530 v16u8 src0, src1, src2, dst0;
531 v8i16 vec0, vec1, vec2;
532 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
533 v4i32 vec_ubvr, vec_ugvg;
534
535 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
536 vec_br, vec_yg);
537 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
538 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
539
540 for (x = 0; x < width; x += 8) {
541 READYUV422(src_y, src_u, src_v, src0, src1, src2);
542 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
543 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
544 vec0, vec2, vec1);
545 vec0 = __msa_srai_h(vec0, 3);
546 vec1 = __msa_srai_h(vec1, 3);
547 vec2 = __msa_srai_h(vec2, 2);
548 vec1 = __msa_slli_h(vec1, 11);
549 vec2 = __msa_slli_h(vec2, 5);
550 vec0 |= vec1;
551 dst0 = (v16u8)(vec2 | vec0);
552 ST_UB(dst0, dst_rgb565);
553 src_y += 8;
554 src_u += 4;
555 src_v += 4;
556 dst_rgb565 += 16;
557 }
558 }
559
560 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
I422ToARGB4444Row_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb4444,const struct YuvConstants * yuvconstants,int width)561 void I422ToARGB4444Row_MSA(const uint8* src_y,
562 const uint8* src_u,
563 const uint8* src_v,
564 uint8* dst_argb4444,
565 const struct YuvConstants* yuvconstants,
566 int width) {
567 int x;
568 v16u8 src0, src1, src2, dst0;
569 v8i16 vec0, vec1, vec2;
570 v8u16 reg0, reg1, reg2;
571 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
572 v4i32 vec_ubvr, vec_ugvg;
573 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
574
575 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
576 vec_br, vec_yg);
577 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
578 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
579
580 for (x = 0; x < width; x += 8) {
581 READYUV422(src_y, src_u, src_v, src0, src1, src2);
582 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
583 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
584 vec0, vec1, vec2);
585 reg0 = (v8u16)__msa_srai_h(vec0, 4);
586 reg1 = (v8u16)__msa_srai_h(vec1, 4);
587 reg2 = (v8u16)__msa_srai_h(vec2, 4);
588 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
589 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
590 reg1 |= const_0xF000;
591 reg0 |= reg2;
592 dst0 = (v16u8)(reg1 | reg0);
593 ST_UB(dst0, dst_argb4444);
594 src_y += 8;
595 src_u += 4;
596 src_v += 4;
597 dst_argb4444 += 16;
598 }
599 }
600
I422ToARGB1555Row_MSA(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb1555,const struct YuvConstants * yuvconstants,int width)601 void I422ToARGB1555Row_MSA(const uint8* src_y,
602 const uint8* src_u,
603 const uint8* src_v,
604 uint8* dst_argb1555,
605 const struct YuvConstants* yuvconstants,
606 int width) {
607 int x;
608 v16u8 src0, src1, src2, dst0;
609 v8i16 vec0, vec1, vec2;
610 v8u16 reg0, reg1, reg2;
611 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
612 v4i32 vec_ubvr, vec_ugvg;
613 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
614
615 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
616 vec_br, vec_yg);
617 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
618 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
619
620 for (x = 0; x < width; x += 8) {
621 READYUV422(src_y, src_u, src_v, src0, src1, src2);
622 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
623 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
624 vec0, vec1, vec2);
625 reg0 = (v8u16)__msa_srai_h(vec0, 3);
626 reg1 = (v8u16)__msa_srai_h(vec1, 3);
627 reg2 = (v8u16)__msa_srai_h(vec2, 3);
628 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
629 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
630 reg1 |= const_0x8000;
631 reg0 |= reg2;
632 dst0 = (v16u8)(reg1 | reg0);
633 ST_UB(dst0, dst_argb1555);
634 src_y += 8;
635 src_u += 4;
636 src_v += 4;
637 dst_argb1555 += 16;
638 }
639 }
640
YUY2ToYRow_MSA(const uint8 * src_yuy2,uint8 * dst_y,int width)641 void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
642 int x;
643 v16u8 src0, src1, src2, src3, dst0, dst1;
644
645 for (x = 0; x < width; x += 32) {
646 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
647 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
648 dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
649 ST_UB2(dst0, dst1, dst_y, 16);
650 src_yuy2 += 64;
651 dst_y += 32;
652 }
653 }
654
YUY2ToUVRow_MSA(const uint8 * src_yuy2,int src_stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)655 void YUY2ToUVRow_MSA(const uint8* src_yuy2,
656 int src_stride_yuy2,
657 uint8* dst_u,
658 uint8* dst_v,
659 int width) {
660 const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
661 int x;
662 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
663 v16u8 vec0, vec1, dst0, dst1;
664
665 for (x = 0; x < width; x += 32) {
666 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
667 LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
668 src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
669 src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
670 src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
671 src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
672 vec0 = __msa_aver_u_b(src0, src2);
673 vec1 = __msa_aver_u_b(src1, src3);
674 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
675 dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
676 ST_UB(dst0, dst_u);
677 ST_UB(dst1, dst_v);
678 src_yuy2 += 64;
679 src_yuy2_next += 64;
680 dst_u += 16;
681 dst_v += 16;
682 }
683 }
684
YUY2ToUV422Row_MSA(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)685 void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
686 uint8* dst_u,
687 uint8* dst_v,
688 int width) {
689 int x;
690 v16u8 src0, src1, src2, src3, dst0, dst1;
691
692 for (x = 0; x < width; x += 32) {
693 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
694 src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
695 src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
696 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
697 dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
698 ST_UB(dst0, dst_u);
699 ST_UB(dst1, dst_v);
700 src_yuy2 += 64;
701 dst_u += 16;
702 dst_v += 16;
703 }
704 }
705
UYVYToYRow_MSA(const uint8 * src_uyvy,uint8 * dst_y,int width)706 void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
707 int x;
708 v16u8 src0, src1, src2, src3, dst0, dst1;
709
710 for (x = 0; x < width; x += 32) {
711 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
712 dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
713 dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
714 ST_UB2(dst0, dst1, dst_y, 16);
715 src_uyvy += 64;
716 dst_y += 32;
717 }
718 }
719
UYVYToUVRow_MSA(const uint8 * src_uyvy,int src_stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)720 void UYVYToUVRow_MSA(const uint8* src_uyvy,
721 int src_stride_uyvy,
722 uint8* dst_u,
723 uint8* dst_v,
724 int width) {
725 const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
726 int x;
727 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
728 v16u8 vec0, vec1, dst0, dst1;
729
730 for (x = 0; x < width; x += 32) {
731 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
732 LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
733 src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
734 src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
735 src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
736 src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
737 vec0 = __msa_aver_u_b(src0, src2);
738 vec1 = __msa_aver_u_b(src1, src3);
739 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
740 dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
741 ST_UB(dst0, dst_u);
742 ST_UB(dst1, dst_v);
743 src_uyvy += 64;
744 src_uyvy_next += 64;
745 dst_u += 16;
746 dst_v += 16;
747 }
748 }
749
UYVYToUV422Row_MSA(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)750 void UYVYToUV422Row_MSA(const uint8* src_uyvy,
751 uint8* dst_u,
752 uint8* dst_v,
753 int width) {
754 int x;
755 v16u8 src0, src1, src2, src3, dst0, dst1;
756
757 for (x = 0; x < width; x += 32) {
758 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
759 src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
760 src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
761 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
762 dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
763 ST_UB(dst0, dst_u);
764 ST_UB(dst1, dst_v);
765 src_uyvy += 64;
766 dst_u += 16;
767 dst_v += 16;
768 }
769 }
770
ARGBToYRow_MSA(const uint8 * src_argb0,uint8 * dst_y,int width)771 void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
772 int x;
773 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
774 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
775 v16i8 zero = {0};
776 v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
777 v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
778 v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
779 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
780
781 for (x = 0; x < width; x += 16) {
782 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
783 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
784 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
785 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
786 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
787 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
788 vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
789 vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
790 reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
791 reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
792 reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
793 reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
794 reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
795 reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
796 reg0 *= const_0x19;
797 reg1 *= const_0x19;
798 reg2 *= const_0x81;
799 reg3 *= const_0x81;
800 reg4 *= const_0x42;
801 reg5 *= const_0x42;
802 reg0 += reg2;
803 reg1 += reg3;
804 reg0 += reg4;
805 reg1 += reg5;
806 reg0 += const_0x1080;
807 reg1 += const_0x1080;
808 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
809 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
810 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
811 ST_UB(dst0, dst_y);
812 src_argb0 += 64;
813 dst_y += 16;
814 }
815 }
816
ARGBToUVRow_MSA(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)817 void ARGBToUVRow_MSA(const uint8* src_argb0,
818 int src_stride_argb,
819 uint8* dst_u,
820 uint8* dst_v,
821 int width) {
822 int x;
823 const uint8* src_argb0_next = src_argb0 + src_stride_argb;
824 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
825 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
826 v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
827 v16u8 dst0, dst1;
828 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
829 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
830 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
831 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
832 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
833 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
834
835 for (x = 0; x < width; x += 32) {
836 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
837 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
838 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
839 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
840 src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
841 src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
842 src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
843 src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
844 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
845 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
846 vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
847 vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
848 vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
849 vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
850 vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
851 vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
852 vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
853 vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
854 vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
855 vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
856 vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
857 vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
858 reg0 = __msa_hadd_u_h(vec8, vec8);
859 reg1 = __msa_hadd_u_h(vec9, vec9);
860 reg2 = __msa_hadd_u_h(vec4, vec4);
861 reg3 = __msa_hadd_u_h(vec5, vec5);
862 reg4 = __msa_hadd_u_h(vec0, vec0);
863 reg5 = __msa_hadd_u_h(vec1, vec1);
864 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
865 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
866 src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
867 src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
868 src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
869 src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
870 src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
871 src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
872 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
873 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
874 vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
875 vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
876 vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
877 vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
878 vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
879 vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
880 vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
881 vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
882 vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
883 vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
884 vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
885 vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
886 reg0 += __msa_hadd_u_h(vec8, vec8);
887 reg1 += __msa_hadd_u_h(vec9, vec9);
888 reg2 += __msa_hadd_u_h(vec4, vec4);
889 reg3 += __msa_hadd_u_h(vec5, vec5);
890 reg4 += __msa_hadd_u_h(vec0, vec0);
891 reg5 += __msa_hadd_u_h(vec1, vec1);
892 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
893 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
894 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
895 reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
896 reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
897 reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
898 reg6 = reg0 * const_0x70;
899 reg7 = reg1 * const_0x70;
900 reg8 = reg2 * const_0x4A;
901 reg9 = reg3 * const_0x4A;
902 reg6 += const_0x8080;
903 reg7 += const_0x8080;
904 reg8 += reg4 * const_0x26;
905 reg9 += reg5 * const_0x26;
906 reg0 *= const_0x12;
907 reg1 *= const_0x12;
908 reg2 *= const_0x5E;
909 reg3 *= const_0x5E;
910 reg4 *= const_0x70;
911 reg5 *= const_0x70;
912 reg2 += reg0;
913 reg3 += reg1;
914 reg4 += const_0x8080;
915 reg5 += const_0x8080;
916 reg6 -= reg8;
917 reg7 -= reg9;
918 reg4 -= reg2;
919 reg5 -= reg3;
920 reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
921 reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
922 reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
923 reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
924 dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
925 dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
926 ST_UB(dst0, dst_u);
927 ST_UB(dst1, dst_v);
928 src_argb0 += 128;
929 src_argb0_next += 128;
930 dst_u += 16;
931 dst_v += 16;
932 }
933 }
934
ARGBToRGB24Row_MSA(const uint8 * src_argb,uint8 * dst_rgb,int width)935 void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
936 int x;
937 v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
938 v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
939 v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
940 16, 17, 18, 20, 21, 22, 24, 25};
941 v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
942 21, 22, 24, 25, 26, 28, 29, 30};
943
944 for (x = 0; x < width; x += 16) {
945 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
946 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
947 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
948 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
949 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
950 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
951 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
952 ST_UB2(dst0, dst1, dst_rgb, 16);
953 ST_UB(dst2, (dst_rgb + 32));
954 src_argb += 64;
955 dst_rgb += 48;
956 }
957 }
958
ARGBToRAWRow_MSA(const uint8 * src_argb,uint8 * dst_rgb,int width)959 void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
960 int x;
961 v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
962 v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
963 v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
964 18, 17, 16, 22, 21, 20, 26, 25};
965 v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
966 21, 20, 26, 25, 24, 30, 29, 28};
967
968 for (x = 0; x < width; x += 16) {
969 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
970 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
971 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
972 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
973 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
974 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
975 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
976 ST_UB2(dst0, dst1, dst_rgb, 16);
977 ST_UB(dst2, (dst_rgb + 32));
978 src_argb += 64;
979 dst_rgb += 48;
980 }
981 }
982
ARGBToRGB565Row_MSA(const uint8 * src_argb,uint8 * dst_rgb,int width)983 void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
984 int x;
985 v16u8 src0, src1, dst0;
986 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
987 v16i8 zero = {0};
988
989 for (x = 0; x < width; x += 8) {
990 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
991 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
992 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
993 vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
994 vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
995 vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
996 vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
997 vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
998 vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
999 vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
1000 vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
1001 vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
1002 vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
1003 vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
1004 vec0 = __msa_binsli_b(vec0, vec1, 2);
1005 vec1 = __msa_binsli_b(vec2, vec3, 4);
1006 vec4 = __msa_binsli_b(vec4, vec5, 2);
1007 vec5 = __msa_binsli_b(vec6, vec7, 4);
1008 vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
1009 vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
1010 dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
1011 ST_UB(dst0, dst_rgb);
1012 src_argb += 32;
1013 dst_rgb += 16;
1014 }
1015 }
1016
ARGBToARGB1555Row_MSA(const uint8 * src_argb,uint8 * dst_rgb,int width)1017 void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
1018 int x;
1019 v16u8 src0, src1, dst0;
1020 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1021 v16i8 zero = {0};
1022
1023 for (x = 0; x < width; x += 8) {
1024 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1025 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1026 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
1027 vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
1028 vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
1029 vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
1030 vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
1031 vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
1032 vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
1033 vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
1034 vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
1035 vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
1036 vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
1037 vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
1038 vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
1039 vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
1040 vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
1041 vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
1042 vec0 = __msa_binsli_b(vec0, vec1, 2);
1043 vec5 = __msa_binsli_b(vec5, vec6, 2);
1044 vec1 = __msa_binsli_b(vec2, vec3, 5);
1045 vec6 = __msa_binsli_b(vec7, vec8, 5);
1046 vec1 = __msa_binsli_b(vec1, vec4, 0);
1047 vec6 = __msa_binsli_b(vec6, vec9, 0);
1048 vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
1049 vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
1050 dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
1051 ST_UB(dst0, dst_rgb);
1052 src_argb += 32;
1053 dst_rgb += 16;
1054 }
1055 }
1056
ARGBToARGB4444Row_MSA(const uint8 * src_argb,uint8 * dst_rgb,int width)1057 void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
1058 int x;
1059 v16u8 src0, src1;
1060 v16u8 vec0, vec1;
1061 v16u8 dst0;
1062 v16i8 zero = {0};
1063
1064 for (x = 0; x < width; x += 8) {
1065 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1066 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1067 vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
1068 vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
1069 src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
1070 src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
1071 vec0 = __msa_binsli_b(vec0, src0, 3);
1072 vec1 = __msa_binsli_b(vec1, src1, 3);
1073 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1074 ST_UB(dst0, dst_rgb);
1075 src_argb += 32;
1076 dst_rgb += 16;
1077 }
1078 }
1079
ARGBToUV444Row_MSA(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int32 width)1080 void ARGBToUV444Row_MSA(const uint8* src_argb,
1081 uint8* dst_u,
1082 uint8* dst_v,
1083 int32 width) {
1084 int32 x;
1085 v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
1086 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1087 v8u16 vec8, vec9, vec10, vec11;
1088 v8u16 const_112 = (v8u16)__msa_ldi_h(112);
1089 v8u16 const_74 = (v8u16)__msa_ldi_h(74);
1090 v8u16 const_38 = (v8u16)__msa_ldi_h(38);
1091 v8u16 const_94 = (v8u16)__msa_ldi_h(94);
1092 v8u16 const_18 = (v8u16)__msa_ldi_h(18);
1093 v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
1094 v16i8 zero = {0};
1095
1096 for (x = width; x > 0; x -= 16) {
1097 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1098 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1099 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
1100 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
1101 reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1102 reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
1103 reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1104 reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
1105 src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
1106 src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
1107 src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
1108 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
1109 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
1110 vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
1111 vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
1112 vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
1113 vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
1114 vec10 = vec0 * const_18;
1115 vec11 = vec1 * const_18;
1116 vec8 = vec2 * const_94;
1117 vec9 = vec3 * const_94;
1118 vec6 = vec4 * const_112;
1119 vec7 = vec5 * const_112;
1120 vec0 *= const_112;
1121 vec1 *= const_112;
1122 vec2 *= const_74;
1123 vec3 *= const_74;
1124 vec4 *= const_38;
1125 vec5 *= const_38;
1126 vec8 += vec10;
1127 vec9 += vec11;
1128 vec6 += const_32896;
1129 vec7 += const_32896;
1130 vec0 += const_32896;
1131 vec1 += const_32896;
1132 vec2 += vec4;
1133 vec3 += vec5;
1134 vec0 -= vec2;
1135 vec1 -= vec3;
1136 vec6 -= vec8;
1137 vec7 -= vec9;
1138 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1139 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1140 vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
1141 vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
1142 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1143 dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
1144 ST_UB(dst0, dst_u);
1145 ST_UB(dst1, dst_v);
1146 src_argb += 64;
1147 dst_u += 16;
1148 dst_v += 16;
1149 }
1150 }
1151
ARGBMultiplyRow_MSA(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)1152 void ARGBMultiplyRow_MSA(const uint8* src_argb0,
1153 const uint8* src_argb1,
1154 uint8* dst_argb,
1155 int width) {
1156 int x;
1157 v16u8 src0, src1, dst0;
1158 v8u16 vec0, vec1, vec2, vec3;
1159 v4u32 reg0, reg1, reg2, reg3;
1160 v8i16 zero = {0};
1161
1162 for (x = 0; x < width; x += 4) {
1163 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1164 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
1165 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1166 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1167 vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
1168 vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
1169 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1170 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1171 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1172 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1173 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1174 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1175 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1176 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
1177 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
1178 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
1179 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
1180 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
1181 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1182 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1183 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1184 ST_UB(dst0, dst_argb);
1185 src_argb0 += 16;
1186 src_argb1 += 16;
1187 dst_argb += 16;
1188 }
1189 }
1190
ARGBAddRow_MSA(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)1191 void ARGBAddRow_MSA(const uint8* src_argb0,
1192 const uint8* src_argb1,
1193 uint8* dst_argb,
1194 int width) {
1195 int x;
1196 v16u8 src0, src1, src2, src3, dst0, dst1;
1197
1198 for (x = 0; x < width; x += 8) {
1199 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1200 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
1201 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
1202 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
1203 dst0 = __msa_adds_u_b(src0, src2);
1204 dst1 = __msa_adds_u_b(src1, src3);
1205 ST_UB2(dst0, dst1, dst_argb, 16);
1206 src_argb0 += 32;
1207 src_argb1 += 32;
1208 dst_argb += 32;
1209 }
1210 }
1211
ARGBSubtractRow_MSA(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)1212 void ARGBSubtractRow_MSA(const uint8* src_argb0,
1213 const uint8* src_argb1,
1214 uint8* dst_argb,
1215 int width) {
1216 int x;
1217 v16u8 src0, src1, src2, src3, dst0, dst1;
1218
1219 for (x = 0; x < width; x += 8) {
1220 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1221 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
1222 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
1223 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
1224 dst0 = __msa_subs_u_b(src0, src2);
1225 dst1 = __msa_subs_u_b(src1, src3);
1226 ST_UB2(dst0, dst1, dst_argb, 16);
1227 src_argb0 += 32;
1228 src_argb1 += 32;
1229 dst_argb += 32;
1230 }
1231 }
1232
ARGBAttenuateRow_MSA(const uint8 * src_argb,uint8 * dst_argb,int width)1233 void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
1234 int x;
1235 v16u8 src0, src1, dst0, dst1;
1236 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1237 v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
1238 v8i16 zero = {0};
1239 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
1240
1241 for (x = 0; x < width; x += 8) {
1242 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1243 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1244 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1245 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1246 vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
1247 vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
1248 vec4 = (v8u16)__msa_fill_h(vec0[3]);
1249 vec5 = (v8u16)__msa_fill_h(vec0[7]);
1250 vec6 = (v8u16)__msa_fill_h(vec1[3]);
1251 vec7 = (v8u16)__msa_fill_h(vec1[7]);
1252 vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
1253 vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1254 vec6 = (v8u16)__msa_fill_h(vec2[3]);
1255 vec7 = (v8u16)__msa_fill_h(vec2[7]);
1256 vec8 = (v8u16)__msa_fill_h(vec3[3]);
1257 vec9 = (v8u16)__msa_fill_h(vec3[7]);
1258 vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1259 vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
1260 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
1261 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
1262 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
1263 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
1264 reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
1265 reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
1266 reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
1267 reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
1268 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1269 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1270 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1271 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1272 reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1273 reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1274 reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1275 reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
1276 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1277 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1278 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1279 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
1280 reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
1281 reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
1282 reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
1283 reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
1284 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1285 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1286 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
1287 vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
1288 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1289 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1290 dst0 = __msa_bmnz_v(dst0, src0, mask);
1291 dst1 = __msa_bmnz_v(dst1, src1, mask);
1292 ST_UB2(dst0, dst1, dst_argb, 16);
1293 src_argb += 32;
1294 dst_argb += 32;
1295 }
1296 }
1297
ARGBToRGB565DitherRow_MSA(const uint8 * src_argb,uint8 * dst_rgb,uint32 dither4,int width)1298 void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
1299 uint8* dst_rgb,
1300 uint32 dither4,
1301 int width) {
1302 int x;
1303 v16u8 src0, src1, dst0, vec0, vec1;
1304 v8i16 vec_d0;
1305 v8i16 reg0, reg1, reg2;
1306 v16i8 zero = {0};
1307 v8i16 max = __msa_ldi_h(0xFF);
1308
1309 vec_d0 = (v8i16)__msa_fill_w(dither4);
1310 vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
1311
1312 for (x = 0; x < width; x += 8) {
1313 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1314 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1315 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1316 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1317 reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
1318 reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
1319 reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
1320 reg0 += vec_d0;
1321 reg1 += vec_d0;
1322 reg2 += vec_d0;
1323 reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
1324 reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
1325 reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
1326 reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
1327 reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
1328 reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
1329 reg0 = __msa_srai_h(reg0, 3);
1330 reg2 = __msa_srai_h(reg2, 3);
1331 reg1 = __msa_srai_h(reg1, 2);
1332 reg2 = __msa_slli_h(reg2, 11);
1333 reg1 = __msa_slli_h(reg1, 5);
1334 reg0 |= reg1;
1335 dst0 = (v16u8)(reg0 | reg2);
1336 ST_UB(dst0, dst_rgb);
1337 src_argb += 32;
1338 dst_rgb += 16;
1339 }
1340 }
1341
ARGBShuffleRow_MSA(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)1342 void ARGBShuffleRow_MSA(const uint8* src_argb,
1343 uint8* dst_argb,
1344 const uint8* shuffler,
1345 int width) {
1346 int x;
1347 v16u8 src0, src1, dst0, dst1;
1348 v16i8 vec0;
1349 v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
1350 int32 val = LW((int32*)shuffler);
1351
1352 vec0 = (v16i8)__msa_fill_w(val);
1353 shuffler_vec += vec0;
1354
1355 for (x = 0; x < width; x += 8) {
1356 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1357 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
1358 dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
1359 dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
1360 ST_UB2(dst0, dst1, dst_argb, 16);
1361 src_argb += 32;
1362 dst_argb += 32;
1363 }
1364 }
1365
ARGBShadeRow_MSA(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)1366 void ARGBShadeRow_MSA(const uint8* src_argb,
1367 uint8* dst_argb,
1368 int width,
1369 uint32 value) {
1370 int x;
1371 v16u8 src0, dst0;
1372 v8u16 vec0, vec1;
1373 v4u32 reg0, reg1, reg2, reg3, rgba_scale;
1374 v8i16 zero = {0};
1375
1376 rgba_scale[0] = value;
1377 rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
1378 rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
1379
1380 for (x = 0; x < width; x += 4) {
1381 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1382 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1383 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1384 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1385 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1386 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1387 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1388 reg0 *= rgba_scale;
1389 reg1 *= rgba_scale;
1390 reg2 *= rgba_scale;
1391 reg3 *= rgba_scale;
1392 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1393 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1394 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1395 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
1396 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1397 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1398 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1399 ST_UB(dst0, dst_argb);
1400 src_argb += 16;
1401 dst_argb += 16;
1402 }
1403 }
1404
ARGBGrayRow_MSA(const uint8 * src_argb,uint8 * dst_argb,int width)1405 void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
1406 int x;
1407 v16u8 src0, src1, vec0, vec1, dst0, dst1;
1408 v8u16 reg0;
1409 v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
1410 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
1411
1412 for (x = 0; x < width; x += 8) {
1413 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1414 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
1415 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1416 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
1417 reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
1418 reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
1419 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
1420 vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
1421 vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
1422 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
1423 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
1424 ST_UB2(dst0, dst1, dst_argb, 16);
1425 src_argb += 32;
1426 dst_argb += 32;
1427 }
1428 }
1429
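// Apply a sepia tone in place: each output channel is a fixed-weight blend of
// the input B/G/R, clamped to 255; alpha is preserved.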
1430 void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
1431 int x;
1432 v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
1433 v8u16 reg0, reg1, reg2;
1434 v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
1435 v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
1436 v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
1437 v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
1438 v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
1439 v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
1440 v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
1441
1442 for (x = 0; x < width; x += 8) {
1443 src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
1444 src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
1445 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1446 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
1447 vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
1448 reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
1449 reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
1450 reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
1451 reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
1452 reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
1453 reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
1454 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
1455 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
1456 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
1457 reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
1458 reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
1459 vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
1460 vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
1461 vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
1462 vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
1463 vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
1464 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
1465 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
1466 ST_UB2(dst0, dst1, dst_argb, 16);
1467 dst_argb += 32;
1468 }
1469 }
1470
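// Convert ARGB4444 to ARGB8888 by replicating each 4-bit channel into both
// nibbles of the output byte.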
1471 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
1472 uint8* dst_argb,
1473 int width) {
1474 int x;
1475 v16u8 src0, src1;
1476 v8u16 vec0, vec1, vec2, vec3;
1477 v16u8 dst0, dst1, dst2, dst3;
1478
1479 for (x = 0; x < width; x += 16) {
1480 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
1481 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
1482 vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
1483 vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
1484 vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
1485 vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
1486 vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
1487 vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
1488 vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
1489 vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
1490 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
1491 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
1492 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
1493 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
1494 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1495 src_argb4444 += 32;
1496 dst_argb += 64;
1497 }
1498 }
1499
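// Convert ARGB1555 to ARGB8888: expand the 5-bit channels to 8 bits and
// expand the 1-bit alpha to 0x00/0xFF.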
1500 void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
1501 uint8* dst_argb,
1502 int width) {
1503 int x;
1504 v8u16 src0, src1;
1505 v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
1506 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
1507 v16u8 dst0, dst1, dst2, dst3;
1508 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1509
1510 for (x = 0; x < width; x += 16) {
1511 src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
1512 src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
1513 vec0 = src0 & const_0x1F;
1514 vec1 = src1 & const_0x1F;
1515 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1516 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1517 vec2 = src0 & const_0x1F;
1518 vec3 = src1 & const_0x1F;
1519 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1520 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1521 vec4 = src0 & const_0x1F;
1522 vec5 = src1 & const_0x1F;
1523 src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
1524 src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
1525 reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1526 reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1527 reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
1528 reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1529 reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
1530 reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
1531 reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
1532 reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
1533 reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
1534 reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
1535 reg3 = -reg3;
1536 reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
1537 reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
1538 reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
1539 reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
1540 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
1541 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
1542 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
1543 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
1544 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1545 src_argb1555 += 32;
1546 dst_argb += 64;
1547 }
1548 }
1549
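// Convert RGB565 to ARGB8888: expand the 5/6/5-bit channels to 8 bits and set
// alpha to 255.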
1550 void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
1551 int x;
1552 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
1553 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
1554 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
1555 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1556 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1557 v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
1558 v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
1559
1560 for (x = 0; x < width; x += 16) {
1561 src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
1562 src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
1563 vec0 = src0 & const_0x1F;
1564 vec1 = src0 & const_0x7E0;
1565 vec2 = src0 & const_0xF800;
1566 vec3 = src1 & const_0x1F;
1567 vec4 = src1 & const_0x7E0;
1568 vec5 = src1 & const_0xF800;
1569 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
1570 reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
1571 reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
1572 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
1573 reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
1574 reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
1575 reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
1576 reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
1577 reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
1578 reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
1579 reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
1580 reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
1581 res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
1582 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
1583 res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
1584 res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
1585 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
1586 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
1587 dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
1588 dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
1589 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1590 src_rgb565 += 32;
1591 dst_argb += 64;
1592 }
1593 }
1594
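// Convert RGB24 to ARGB by inserting an opaque alpha byte after every third
// source byte.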
1595 void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
1596 int x;
1597 v16u8 src0, src1, src2;
1598 v16u8 vec0, vec1, vec2;
1599 v16u8 dst0, dst1, dst2, dst3;
1600 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1601 v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
1602
1603 for (x = 0; x < width; x += 16) {
1604 src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
1605 src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
1606 src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
1607 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
1608 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
1609 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
1610 dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
1611 dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
1612 dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
1613 dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
1614 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1615 src_rgb24 += 48;
1616 dst_argb += 64;
1617 }
1618 }
1619
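// Convert RAW (R,G,B byte order) to ARGB: swap R and B while inserting an
// opaque alpha.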
1620 void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
1621 int x;
1622 v16u8 src0, src1, src2;
1623 v16u8 vec0, vec1, vec2;
1624 v16u8 dst0, dst1, dst2, dst3;
1625 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
1626 v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
1627
1628 for (x = 0; x < width; x += 16) {
1629 src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
1630 src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
1631 src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
1632 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
1633 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
1634 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
1635 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
1636 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
1637 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
1638 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
1639 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
1640 src_raw += 48;
1641 dst_argb += 64;
1642 }
1643 }
1644
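// Convert ARGB1555 to luma Y: expand the channels to 8 bits and apply BT.601
// weights, (25*B + 129*G + 66*R + 0x1080) >> 8.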
1645 void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
1646 int x;
1647 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
1648 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
1649 v16u8 dst0;
1650 v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
1651 v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
1652 v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
1653 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1654 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
1655
1656 for (x = 0; x < width; x += 16) {
1657 src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
1658 src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
1659 vec0 = src0 & const_0x1F;
1660 vec1 = src1 & const_0x1F;
1661 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
1662 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
1663 vec2 = src0 & const_0x1F;
1664 vec3 = src1 & const_0x1F;
1665 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
1666 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
1667 vec4 = src0 & const_0x1F;
1668 vec5 = src1 & const_0x1F;
1669 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
1670 reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
1671 reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
1672 reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
1673 reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
1674 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
1675 reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
1676 reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
1677 reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
1678 reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
1679 reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
1680 reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
1681 reg0 *= const_0x19;
1682 reg1 *= const_0x19;
1683 reg2 *= const_0x81;
1684 reg3 *= const_0x81;
1685 reg4 *= const_0x42;
1686 reg5 *= const_0x42;
1687 reg0 += reg2;
1688 reg1 += reg3;
1689 reg0 += reg4;
1690 reg1 += reg5;
1691 reg0 += const_0x1080;
1692 reg1 += const_0x1080;
1693 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
1694 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
1695 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
1696 ST_UB(dst0, dst_y);
1697 src_argb1555 += 32;
1698 dst_y += 16;
1699 }
1700 }
1701
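// Convert RGB565 to luma Y: expand the channels to 8 bits and apply BT.601
// weights via 32-bit dot products on interleaved channel pairs.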
1702 void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
1703 int x;
1704 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1705 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
1706 v4u32 res0, res1, res2, res3;
1707 v16u8 dst0;
1708 v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
1709 v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
1710 v8i16 const_0x1080 = __msa_fill_h(0x1080);
1711 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1712 v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
1713 v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
1714
1715 for (x = 0; x < width; x += 16) {
1716 src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
1717 src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
1718 vec0 = src0 & const_0x1F;
1719 vec1 = src0 & const_0x7E0;
1720 vec2 = src0 & const_0xF800;
1721 vec3 = src1 & const_0x1F;
1722 vec4 = src1 & const_0x7E0;
1723 vec5 = src1 & const_0xF800;
1724 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
1725 reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
1726 reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
1727 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
1728 reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
1729 reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
1730 reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
1731 reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
1732 reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
1733 reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
1734 reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
1735 reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
1736 vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
1737 vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
1738 vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
1739 vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
1740 vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
1741 vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
1742 vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
1743 vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
1744 res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
1745 res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
1746 res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
1747 res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
1748 res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
1749 res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
1750 res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
1751 res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
1752 res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
1753 res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
1754 res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
1755 res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
1756 vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
1757 vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
1758 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1759 ST_UB(dst0, dst_y);
1760 src_rgb565 += 32;
1761 dst_y += 16;
1762 }
1763 }
1764
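// Convert RGB24 to luma Y with BT.601 weights,
// (25*B + 129*G + 66*R + 0x1080) >> 8.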
1765 void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
1766 int x;
1767 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
1768 v8u16 vec0, vec1, vec2, vec3;
1769 v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
1770 v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
1771 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
1772 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
1773 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
1774 18, 19, 20, 21, 21, 22, 23, 24};
1775 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
1776 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
1777 v16i8 zero = {0};
1778
1779 for (x = 0; x < width; x += 16) {
1780 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1781 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
1782 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
1783 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
1784 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
1785 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
1786 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
1787 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1788 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1789 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
1790 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
1791 vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
1792 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
1793 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
1794 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
1795 vec0 += const_0x1080;
1796 vec1 += const_0x1080;
1797 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1798 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1799 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1800 ST_UB(dst0, dst_y);
1801 src_argb0 += 48;
1802 dst_y += 16;
1803 }
1804 }
1805
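// Convert RAW (R,G,B byte order) to luma Y; same BT.601 weights as
// RGB24ToYRow_MSA but with the coefficients swapped for the R/B order.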
1806 void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
1807 int x;
1808 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
1809 v8u16 vec0, vec1, vec2, vec3;
1810 v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
1811 v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
1812 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
1813 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
1814 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
1815 18, 19, 20, 21, 21, 22, 23, 24};
1816 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
1817 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
1818 v16i8 zero = {0};
1819
1820 for (x = 0; x < width; x += 16) {
1821 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1822 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
1823 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
1824 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
1825 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
1826 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
1827 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
1828 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1829 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1830 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
1831 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
1832 vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
1833 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
1834 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
1835 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
1836 vec0 += const_0x1080;
1837 vec1 += const_0x1080;
1838 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1839 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1840 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1841 ST_UB(dst0, dst_y);
1842 src_argb0 += 48;
1843 dst_y += 16;
1844 }
1845 }
1846
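// Convert two rows of ARGB1555 to 8 U and 8 V values per 16 source pixels,
// averaging 2x2 pixel blocks before applying the chroma weights.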
1847 void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
1848 int src_stride_argb1555,
1849 uint8* dst_u,
1850 uint8* dst_v,
1851 int width) {
1852 int x;
1853 const uint16* s = (const uint16*)src_argb1555;
1854 const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
1855 int64_t res0, res1;
1856 v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
1857 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
1858 v16u8 dst0;
1859 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
1860 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
1861 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
1862 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
1863 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
1864 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
1865 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1866
1867 for (x = 0; x < width; x += 16) {
1868 src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
1869 src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
1870 src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
1871 src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
1872 vec0 = src0 & const_0x1F;
1873 vec1 = src1 & const_0x1F;
1874 vec0 += src2 & const_0x1F;
1875 vec1 += src3 & const_0x1F;
1876 vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1877 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
1878 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
1879 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
1880 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
1881 vec2 = src0 & const_0x1F;
1882 vec3 = src1 & const_0x1F;
1883 vec2 += src2 & const_0x1F;
1884 vec3 += src3 & const_0x1F;
1885 vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1886 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
1887 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
1888 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
1889 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
1890 vec4 = src0 & const_0x1F;
1891 vec5 = src1 & const_0x1F;
1892 vec4 += src2 & const_0x1F;
1893 vec5 += src3 & const_0x1F;
1894 vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
1895 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
1896 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
1897 vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
1898 vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
1899 vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
1900 vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
1901 vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
1902 vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
1903 vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
1904 reg0 = vec6 * const_0x70;
1905 reg1 = vec0 * const_0x4A;
1906 reg2 = vec2 * const_0x70;
1907 reg3 = vec0 * const_0x5E;
1908 reg0 += const_0x8080;
1909 reg1 += vec2 * const_0x26;
1910 reg2 += const_0x8080;
1911 reg3 += vec6 * const_0x12;
1912 reg0 -= reg1;
1913 reg2 -= reg3;
1914 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
1915 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
1916 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
1917 res0 = __msa_copy_u_d((v2i64)dst0, 0);
1918 res1 = __msa_copy_u_d((v2i64)dst0, 1);
1919 SD(res0, dst_u);
1920 SD(res1, dst_v);
1921 s += 16;
1922 t += 16;
1923 dst_u += 8;
1924 dst_v += 8;
1925 }
1926 }
1927
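// Convert two rows of RGB565 to U and V, averaging 2x2 pixel blocks before
// applying the chroma weights.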
1928 void RGB565ToUVRow_MSA(const uint8* src_rgb565,
1929 int src_stride_rgb565,
1930 uint8* dst_u,
1931 uint8* dst_v,
1932 int width) {
1933 int x;
1934 const uint16* s = (const uint16*)src_rgb565;
1935 const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
1936 int64_t res0, res1;
1937 v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
1938 v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
1939 v16u8 dst0;
1940 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
1941 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
1942 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
1943 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
1944 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
1945 v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
1946 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
1947 v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
1948
1949 for (x = 0; x < width; x += 16) {
1950 src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
1951 src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
1952 src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
1953 src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
1954 vec0 = src0 & const_0x1F;
1955 vec1 = src1 & const_0x1F;
1956 vec0 += src2 & const_0x1F;
1957 vec1 += src3 & const_0x1F;
1958 vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1959 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
1960 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
1961 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
1962 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
1963 vec2 = src0 & const_0x3F;
1964 vec3 = src1 & const_0x3F;
1965 vec2 += src2 & const_0x3F;
1966 vec3 += src3 & const_0x3F;
1967 vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
1968 src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
1969 src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
1970 src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
1971 src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
1972 vec4 = src0 & const_0x1F;
1973 vec5 = src1 & const_0x1F;
1974 vec4 += src2 & const_0x1F;
1975 vec5 += src3 & const_0x1F;
1976 vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
1977 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
1978 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
1979 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
1980 vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
1981 vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
1982 vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
1983 vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
1984 reg0 = vec3 * const_0x70;
1985 reg1 = vec1 * const_0x4A;
1986 reg2 = vec4 * const_0x70;
1987 reg3 = vec1 * const_0x5E;
1988 reg0 += const_32896;
1989 reg1 += vec4 * const_0x26;
1990 reg2 += const_32896;
1991 reg3 += vec3 * const_0x12;
1992 reg0 -= reg1;
1993 reg2 -= reg3;
1994 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
1995 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
1996 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
1997 res0 = __msa_copy_u_d((v2i64)dst0, 0);
1998 res1 = __msa_copy_u_d((v2i64)dst0, 1);
1999 SD(res0, dst_u);
2000 SD(res1, dst_v);
2001 s += 16;
2002 t += 16;
2003 dst_u += 8;
2004 dst_v += 8;
2005 }
2006 }
2007
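// Convert two rows of RGB24 to U and V: expand the packed pixels to 4-byte
// lanes, average 2x2 blocks, then apply the chroma weights.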
2008 void RGB24ToUVRow_MSA(const uint8* src_rgb0,
2009 int src_stride_rgb,
2010 uint8* dst_u,
2011 uint8* dst_v,
2012 int width) {
2013 int x;
2014 const uint8* s = src_rgb0;
2015 const uint8* t = src_rgb0 + src_stride_rgb;
2016 int64 res0, res1;
2017 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2018 v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
2019 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2020 v8i16 reg0, reg1, reg2, reg3;
2021 v16u8 dst0;
2022 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
2023 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
2024 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
2025 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
2026 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
2027 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2028 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
2029 v16i8 zero = {0};
2030
2031 for (x = 0; x < width; x += 16) {
2032 inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
2033 inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
2034 inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
2035 inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
2036 inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
2037 inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
2038 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
2039 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
2040 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
2041 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
2042 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
2043 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
2044 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
2045 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
2046 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
2047 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
2048 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
2049 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
2050 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
2051 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
2052 vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
2053 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
2054 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
2055 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
2056 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
2057 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
2058 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
2059 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
2060 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
2061 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
2062 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
2063 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
2064 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
2065 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
2066 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
2067 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
2068 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
2069 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
2070 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
2071 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
2072 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
2073 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
2074 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
2075 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
2076 reg0 = __msa_srai_h((v8i16)reg0, 2);
2077 reg1 = __msa_srai_h((v8i16)reg1, 2);
2078 reg2 = __msa_srai_h((v8i16)reg2, 2);
2079 reg3 = __msa_srai_h((v8i16)reg3, 2);
2080 vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
2081 vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
2082 vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
2083 vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
2084 vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
2085 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
2086 vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
2087 vec3 = vec0 * const_0x70;
2088 vec4 = vec1 * const_0x4A;
2089 vec5 = vec2 * const_0x26;
2090 vec2 *= const_0x70;
2091 vec1 *= const_0x5E;
2092 vec0 *= const_0x12;
2093 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
2094 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
2095 reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
2096 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
2097 reg0 += reg1;
2098 reg2 += reg3;
2099 reg0 = __msa_srai_h(reg0, 8);
2100 reg2 = __msa_srai_h(reg2, 8);
2101 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
2102 res0 = __msa_copy_u_d((v2i64)dst0, 0);
2103 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2104 SD(res0, dst_u);
2105 SD(res1, dst_v);
2106 t += 48;
2107 s += 48;
2108 dst_u += 8;
2109 dst_v += 8;
2110 }
2111 }
2112
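// Convert two rows of RAW (R,G,B byte order) to U and V; same structure as
// RGB24ToUVRow_MSA with the channel picks adjusted for the swapped order.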
2113 void RAWToUVRow_MSA(const uint8* src_rgb0,
2114 int src_stride_rgb,
2115 uint8* dst_u,
2116 uint8* dst_v,
2117 int width) {
2118 int x;
2119 const uint8* s = src_rgb0;
2120 const uint8* t = src_rgb0 + src_stride_rgb;
2121 int64 res0, res1;
2122 v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
2123 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2124 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2125 v8i16 reg0, reg1, reg2, reg3;
2126 v16u8 dst0;
2127 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
2128 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
2129 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
2130 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
2131 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
2132 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2133 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
2134 v16i8 zero = {0};
2135
2136 for (x = 0; x < width; x += 16) {
2137 inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
2138 inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
2139 inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
2140 inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
2141 inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
2142 inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
2143 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
2144 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
2145 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
2146 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
2147 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
2148 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
2149 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
2150 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
2151 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
2152 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
2153 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
2154 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
2155 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
2156 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
2157 vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
2158 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
2159 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
2160 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
2161 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
2162 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
2163 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
2164 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
2165 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
2166 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
2167 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
2168 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
2169 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
2170 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
2171 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
2172 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
2173 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
2174 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
2175 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
2176 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
2177 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
2178 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
2179 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
2180 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
2181 reg0 = __msa_srai_h(reg0, 2);
2182 reg1 = __msa_srai_h(reg1, 2);
2183 reg2 = __msa_srai_h(reg2, 2);
2184 reg3 = __msa_srai_h(reg3, 2);
2185 vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2186 vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2187 vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
2188 vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
2189 vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
2190 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
2191 vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
2192 vec3 = vec0 * const_0x70;
2193 vec4 = vec1 * const_0x4A;
2194 vec5 = vec2 * const_0x26;
2195 vec2 *= const_0x70;
2196 vec1 *= const_0x5E;
2197 vec0 *= const_0x12;
2198 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
2199 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
2200 reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
2201 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
2202 reg0 += reg1;
2203 reg2 += reg3;
2204 reg0 = __msa_srai_h(reg0, 8);
2205 reg2 = __msa_srai_h(reg2, 8);
2206 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
2207 res0 = __msa_copy_u_d((v2i64)dst0, 0);
2208 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2209 SD(res0, dst_u);
2210 SD(res1, dst_v);
2211 t += 48;
2212 s += 48;
2213 dst_u += 8;
2214 dst_v += 8;
2215 }
2216 }
2217
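// Convert NV12 (Y plane + interleaved UV plane) to ARGB, 8 pixels per loop.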
2218 void NV12ToARGBRow_MSA(const uint8* src_y,
2219 const uint8* src_uv,
2220 uint8* rgb_buf,
2221 const struct YuvConstants* yuvconstants,
2222 int width) {
2223 int x;
2224 uint64 val0, val1;
2225 v16u8 src0, src1, res0, res1, dst0, dst1;
2226 v8i16 vec0, vec1, vec2;
2227 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2228 v4i32 vec_ubvr, vec_ugvg;
2229 v16u8 zero = {0};
2230 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2231
2232 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2233 vec_br, vec_yg);
2234 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2235 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2236
2237 for (x = 0; x < width; x += 8) {
2238 val0 = LD(src_y);
2239 val1 = LD(src_uv);
2240 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2241 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2242 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2243 vec0, vec1, vec2);
2244 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2245 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
2246 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2247 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2248 ST_UB2(dst0, dst1, rgb_buf, 16);
2249 src_y += 8;
2250 src_uv += 8;
2251 rgb_buf += 32;
2252 }
2253 }
2254
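// Convert NV12 to RGB565, packing the converted channels into 5/6/5 bits.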
2255 void NV12ToRGB565Row_MSA(const uint8* src_y,
2256 const uint8* src_uv,
2257 uint8* rgb_buf,
2258 const struct YuvConstants* yuvconstants,
2259 int width) {
2260 int x;
2261 uint64 val0, val1;
2262 v16u8 src0, src1, dst0;
2263 v8i16 vec0, vec1, vec2;
2264 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2265 v4i32 vec_ubvr, vec_ugvg;
2266 v16u8 zero = {0};
2267
2268 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2269 vec_br, vec_yg);
2270 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2271 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2272
2273 for (x = 0; x < width; x += 8) {
2274 val0 = LD(src_y);
2275 val1 = LD(src_uv);
2276 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2277 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2278 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2279 vec0, vec1, vec2);
2280 vec0 = vec0 >> 3;
2281 vec1 = (vec1 >> 2) << 5;
2282 vec2 = (vec2 >> 3) << 11;
2283 dst0 = (v16u8)(vec0 | vec1 | vec2);
2284 ST_UB(dst0, rgb_buf);
2285 src_y += 8;
2286 src_uv += 8;
2287 rgb_buf += 16;
2288 }
2289 }
2290
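// Convert NV21 (Y plane + interleaved VU plane) to ARGB; the VU bytes are
// swapped to UV order before conversion.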
2291 void NV21ToARGBRow_MSA(const uint8* src_y,
2292 const uint8* src_vu,
2293 uint8* rgb_buf,
2294 const struct YuvConstants* yuvconstants,
2295 int width) {
2296 int x;
2297 uint64 val0, val1;
2298 v16u8 src0, src1, res0, res1, dst0, dst1;
2299 v8i16 vec0, vec1, vec2;
2300 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2301 v4i32 vec_ubvr, vec_ugvg;
2302 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2303 v16u8 zero = {0};
2304 v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
2305
2306 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2307 vec_br, vec_yg);
2308 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2309 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2310
2311 for (x = 0; x < width; x += 8) {
2312 val0 = LD(src_y);
2313 val1 = LD(src_vu);
2314 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2315 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2316 src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
2317 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2318 vec0, vec1, vec2);
2319 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2320 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
2321 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2322 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2323 ST_UB2(dst0, dst1, rgb_buf, 16);
2324 src_y += 8;
2325 src_vu += 8;
2326 rgb_buf += 32;
2327 }
2328 }
2329
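// Combine Sobel X and Y magnitudes with a saturating add and store the result
// as gray ARGB (alpha = 255).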
2330 void SobelRow_MSA(const uint8* src_sobelx,
2331 const uint8* src_sobely,
2332 uint8* dst_argb,
2333 int width) {
2334 int x;
2335 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
2336 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
2337 v16i8 const_0x4 = __msa_ldi_b(0x4);
2338 v16i8 mask1 = mask0 + const_0x4;
2339 v16i8 mask2 = mask1 + const_0x4;
2340 v16i8 mask3 = mask2 + const_0x4;
2341 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2342
2343 for (x = 0; x < width; x += 16) {
2344 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2345 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2346 vec0 = __msa_adds_u_b(src0, src1);
2347 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
2348 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
2349 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
2350 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
2351 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2352 src_sobelx += 16;
2353 src_sobely += 16;
2354 dst_argb += 64;
2355 }
2356 }
2357
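// Combine Sobel X and Y magnitudes with a saturating add and store as a
// single gray plane.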
2358 void SobelToPlaneRow_MSA(const uint8* src_sobelx,
2359 const uint8* src_sobely,
2360 uint8* dst_y,
2361 int width) {
2362 int x;
2363 v16u8 src0, src1, src2, src3, dst0, dst1;
2364
2365 for (x = 0; x < width; x += 32) {
2366 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2367 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
2368 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2369 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
2370 dst0 = __msa_adds_u_b(src0, src2);
2371 dst1 = __msa_adds_u_b(src1, src3);
2372 ST_UB2(dst0, dst1, dst_y, 16);
2373 src_sobelx += 32;
2374 src_sobely += 32;
2375 dst_y += 32;
2376 }
2377 }
2378
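// Pack Sobel results as ARGB: R = Sobel X, G = combined magnitude,
// B = Sobel Y, A = 255.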
2379 void SobelXYRow_MSA(const uint8* src_sobelx,
2380 const uint8* src_sobely,
2381 uint8* dst_argb,
2382 int width) {
2383 int x;
2384 v16u8 src0, src1, vec0, vec1, vec2;
2385 v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
2386 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2387
2388 for (x = 0; x < width; x += 16) {
2389 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2390 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2391 vec0 = __msa_adds_u_b(src0, src1);
2392 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
2393 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
2394 reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
2395 reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
2396 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
2397 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
2398 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
2399 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
2400 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2401 src_sobelx += 16;
2402 src_sobely += 16;
2403 dst_argb += 64;
2404 }
2405 }
2406
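// Convert ARGB to full-range (JPEG) luma: (15*B + 75*G + 38*R + 64) >> 7.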
2407 void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2408 int x;
2409 v16u8 src0, src1, src2, src3, dst0;
2410 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
2411 v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
2412 v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
2413
2414 for (x = 0; x < width; x += 16) {
2415 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2416 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2417 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2418 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2419 ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
2420 dst0);
2421 ST_UB(dst0, dst_y);
2422 src_argb0 += 64;
2423 dst_y += 16;
2424 }
2425 }
2426
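// Convert BGRA (A,R,G,B byte order) to luma Y with BT.601 weights.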
2427 void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2428 int x;
2429 v16u8 src0, src1, src2, src3, dst0;
2430 v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
2431 v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
2432 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2433
2434 for (x = 0; x < width; x += 16) {
2435 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2436 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2437 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2438 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2439 ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
2440 dst0);
2441 ST_UB(dst0, dst_y);
2442 src_argb0 += 64;
2443 dst_y += 16;
2444 }
2445 }
2446
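// Convert ABGR (R,G,B,A byte order) to luma Y with BT.601 weights.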
2447 void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2448 int x;
2449 v16u8 src0, src1, src2, src3, dst0;
2450 v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
2451 v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
2452 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2453
2454 for (x = 0; x < width; x += 16) {
2455 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2456 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2457 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2458 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2459 ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
2460 dst0);
2461 ST_UB(dst0, dst_y);
2462 src_argb0 += 64;
2463 dst_y += 16;
2464 }
2465 }
2466
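// Convert RGBA (A,B,G,R byte order) to luma Y with BT.601 weights.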
2467 void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2468 int x;
2469 v16u8 src0, src1, src2, src3, dst0;
2470 v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
2471 v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
2472 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2473
2474 for (x = 0; x < width; x += 16) {
2475 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2476 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2477 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2478 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2479 ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
2480 dst0);
2481 ST_UB(dst0, dst_y);
2482 src_argb0 += 64;
2483 dst_y += 16;
2484 }
2485 }
2486
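// Convert two rows of ARGB to full-range (JPEG) U and V, averaging 2x2 pixel
// blocks, 32 pixels per loop.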
2487 void ARGBToUVJRow_MSA(const uint8* src_rgb0,
2488 int src_stride_rgb,
2489 uint8* dst_u,
2490 uint8* dst_v,
2491 int width) {
2492 int x;
2493 const uint8* s = src_rgb0;
2494 const uint8* t = src_rgb0 + src_stride_rgb;
2495 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2496 v16u8 vec0, vec1, vec2, vec3;
2497 v16u8 dst0, dst1;
2498 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2499 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2500 18, 19, 22, 23, 26, 27, 30, 31};
2501 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2502 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
2503 v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
2504 v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
2505 v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
2506 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2507
2508 for (x = 0; x < width; x += 32) {
2509 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
2510 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
2511 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
2512 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
2513 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
2514 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
2515 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
2516 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
2517 src0 = __msa_aver_u_b(src0, src4);
2518 src1 = __msa_aver_u_b(src1, src5);
2519 src2 = __msa_aver_u_b(src2, src6);
2520 src3 = __msa_aver_u_b(src3, src7);
2521 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
2522 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
2523 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
2524 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
2525 vec0 = __msa_aver_u_b(src4, src6);
2526 vec1 = __msa_aver_u_b(src5, src7);
2527 src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
2528 src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
2529 src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
2530 src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
2531 src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
2532 src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
2533 src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
2534 src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
2535 src0 = __msa_aver_u_b(src0, src4);
2536 src1 = __msa_aver_u_b(src1, src5);
2537 src2 = __msa_aver_u_b(src2, src6);
2538 src3 = __msa_aver_u_b(src3, src7);
2539 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
2540 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
2541 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
2542 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
2543 vec2 = __msa_aver_u_b(src4, src6);
2544 vec3 = __msa_aver_u_b(src5, src7);
2545 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
2546 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
2547 dst1);
2548 ST_UB(dst0, dst_v);
2549 ST_UB(dst1, dst_u);
2550 s += 128;
2551 t += 128;
2552 dst_v += 16;
2553 dst_u += 16;
2554 }
2555 }
2556
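// Convert two rows of BGRA to U and V using the shared READ_ARGB and ARGBTOUV
// helper macros.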
2557 void BGRAToUVRow_MSA(const uint8* src_rgb0,
2558 int src_stride_rgb,
2559 uint8* dst_u,
2560 uint8* dst_v,
2561 int width) {
2562 int x;
2563 const uint8* s = src_rgb0;
2564 const uint8* t = src_rgb0 + src_stride_rgb;
2565 v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
2566 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2567 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2568 18, 19, 22, 23, 26, 27, 30, 31};
2569 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2570 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
2571 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
2572 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
2573 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
2574 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2575
2576 for (x = 0; x < width; x += 32) {
2577 READ_ARGB(s, t, vec0, vec1, vec2, vec3);
2578 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
2579 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
2580 dst1);
2581 ST_UB(dst0, dst_v);
2582 ST_UB(dst1, dst_u);
2583 s += 128;
2584 t += 128;
2585 dst_v += 16;
2586 dst_u += 16;
2587 }
2588 }
2589
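// Convert two rows of ABGR to U and V using the shared READ_ARGB and ARGBTOUV
// helper macros.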
2590 void ABGRToUVRow_MSA(const uint8* src_rgb0,
2591 int src_stride_rgb,
2592 uint8* dst_u,
2593 uint8* dst_v,
2594 int width) {
2595 int x;
2596 const uint8* s = src_rgb0;
2597 const uint8* t = src_rgb0 + src_stride_rgb;
2598 v16u8 src0, src1, src2, src3;
2599 v16u8 dst0, dst1;
2600 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2601 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2602 18, 19, 22, 23, 26, 27, 30, 31};
2603 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2604 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
2605 v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
2606 v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
2607 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
2608 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2609
2610 for (x = 0; x < width; x += 32) {
2611 READ_ARGB(s, t, src0, src1, src2, src3);
2612 ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
2613 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
2614 dst1);
2615 ST_UB(dst0, dst_u);
2616 ST_UB(dst1, dst_v);
2617 s += 128;
2618 t += 128;
2619 dst_u += 16;
2620 dst_v += 16;
2621 }
2622 }
2623
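// Convert two rows of RGBA to U and V; the coefficient constants are ordered
// for the RGBA channel layout.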
2624 void RGBAToUVRow_MSA(const uint8* src_rgb0,
2625 int src_stride_rgb,
2626 uint8* dst_u,
2627 uint8* dst_v,
2628 int width) {
2629 int x;
2630 const uint8* s = src_rgb0;
2631 const uint8* t = src_rgb0 + src_stride_rgb;
2632 v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
2633 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2634 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2635 18, 19, 22, 23, 26, 27, 30, 31};
2636 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2637 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
2638 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
2639 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
2640 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
2641 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2642
2643 for (x = 0; x < width; x += 32) {
2644 READ_ARGB(s, t, vec0, vec1, vec2, vec3);
2645 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E,
2646 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
2647 dst1);
2648 ST_UB(dst0, dst_u);
2649 ST_UB(dst1, dst_v);
2650 s += 128;
2651 t += 128;
2652 dst_u += 16;
2653 dst_v += 16;
2654 }
2655 }
2656
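// Convert I444 (full-resolution U and V planes) to ARGB, 8 pixels per loop.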
2657 void I444ToARGBRow_MSA(const uint8* src_y,
2658 const uint8* src_u,
2659 const uint8* src_v,
2660 uint8* rgb_buf,
2661 const struct YuvConstants* yuvconstants,
2662 int width) {
2663 int x;
2664 v16u8 src0, src1, src2, dst0, dst1;
2665 v8u16 vec0, vec1, vec2;
2666 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
2667 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2668 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2669 v8i16 zero = {0};
2670
2671 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2672 vec_br, vec_yg);
2673
2674 for (x = 0; x < width; x += 8) {
2675 READI444(src_y, src_u, src_v, src0, src1, src2);
2676 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2677 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2678 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2679 reg0 *= vec_yg;
2680 reg1 *= vec_yg;
2681 reg0 = __msa_srai_w(reg0, 16);
2682 reg1 = __msa_srai_w(reg1, 16);
2683 reg4 = reg0 + vec_br;
2684 reg5 = reg1 + vec_br;
2685 reg2 = reg0 + vec_bg;
2686 reg3 = reg1 + vec_bg;
2687 reg0 += vec_bb;
2688 reg1 += vec_bb;
2689 vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
2690 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
2691 reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2692 reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2693 reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
2694 reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
2695 reg0 -= reg6 * vec_ub;
2696 reg1 -= reg7 * vec_ub;
2697 reg2 -= reg6 * vec_ug;
2698 reg3 -= reg7 * vec_ug;
2699 reg4 -= reg8 * vec_vr;
2700 reg5 -= reg9 * vec_vr;
2701 reg2 -= reg8 * vec_vg;
2702 reg3 -= reg9 * vec_vg;
2703 reg0 = __msa_srai_w(reg0, 6);
2704 reg1 = __msa_srai_w(reg1, 6);
2705 reg2 = __msa_srai_w(reg2, 6);
2706 reg3 = __msa_srai_w(reg3, 6);
2707 reg4 = __msa_srai_w(reg4, 6);
2708 reg5 = __msa_srai_w(reg5, 6);
2709 CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
2710 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2711 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2712 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
2713 vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
2714 vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
2715 dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
2716 dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
2717 ST_UB2(dst0, dst1, rgb_buf, 16);
2718 src_y += 8;
2719 src_u += 8;
2720 src_v += 8;
2721 rgb_buf += 32;
2722 }
2723 }
2724
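// Convert I400 (Y only) to ARGB: scale limited-range luma to full range,
// clamp, and replicate to B/G/R with opaque alpha.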
2725 void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
2726 int x;
2727 v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
2728 v8i16 vec0, vec1;
2729 v4i32 reg0, reg1, reg2, reg3;
2730 v4i32 vec_yg = __msa_fill_w(0x4A35);
2731 v8i16 vec_ygb = __msa_fill_h(0xFB78);
2732 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2733 v8i16 max = __msa_ldi_h(0xFF);
2734 v8i16 zero = {0};
2735
2736 for (x = 0; x < width; x += 16) {
2737 src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
2738 vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2739 vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
2740 reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
2741 reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
2742 reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
2743 reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
2744 reg0 *= vec_yg;
2745 reg1 *= vec_yg;
2746 reg2 *= vec_yg;
2747 reg3 *= vec_yg;
2748 reg0 = __msa_srai_w(reg0, 16);
2749 reg1 = __msa_srai_w(reg1, 16);
2750 reg2 = __msa_srai_w(reg2, 16);
2751 reg3 = __msa_srai_w(reg3, 16);
2752 vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
2753 vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
2754 vec0 += vec_ygb;
2755 vec1 += vec_ygb;
2756 vec0 = __msa_srai_h(vec0, 6);
2757 vec1 = __msa_srai_h(vec1, 6);
2758 vec0 = __msa_maxi_s_h(vec0, 0);
2759 vec1 = __msa_maxi_s_h(vec1, 0);
2760 vec0 = __msa_min_s_h(max, vec0);
2761 vec1 = __msa_min_s_h(max, vec1);
2762 res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
2763 res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
2764 res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
2765 res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
2766 res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
2767 dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
2768 dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
2769 dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
2770 dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
2771 ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
2772 src_y += 16;
2773 rgb_buf += 64;
2774 }
2775 }
2776
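// Convert 16 Y pixels to 16 ARGB pixels by copying Y directly into B, G and R
// (no scaling); alpha is set to 255.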
2777 void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
2778 int x;
2779 v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
2780 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2781
2782 for (x = 0; x < width; x += 16) {
2783 src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
2784 vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
2785 vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
2786 vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
2787 vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
2788 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
2789 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
2790 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
2791 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
2792 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2793 src_y += 16;
2794 dst_argb += 64;
2795 }
2796 }
2797
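// Convert 8 YUY2 pixels to 8 ARGB pixels per loop iteration.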
2798 void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
2799 uint8* rgb_buf,
2800 const struct YuvConstants* yuvconstants,
2801 int width) {
2802 int x;
2803 v16u8 src0, src1, src2;
2804 v8i16 vec0, vec1, vec2;
2805 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2806 v4i32 vec_ubvr, vec_ugvg;
2807 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2808
2809 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2810 vec_br, vec_yg);
2811 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2812 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2813
2814 for (x = 0; x < width; x += 8) {
2815 src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
2816 src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
2817 src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
2818 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2819 vec0, vec1, vec2);
2820 STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
2821 src_yuy2 += 16;
2822 rgb_buf += 32;
2823 }
2824 }
2825
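// Convert 8 UYVY pixels to 8 ARGB pixels per loop iteration.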
2826 void UYVYToARGBRow_MSA(const uint8* src_uyvy,
2827 uint8* rgb_buf,
2828 const struct YuvConstants* yuvconstants,
2829 int width) {
2830 int x;
2831 v16u8 src0, src1, src2;
2832 v8i16 vec0, vec1, vec2;
2833 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2834 v4i32 vec_ubvr, vec_ugvg;
2835 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
2836
2837 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2838 vec_br, vec_yg);
2839 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2840 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2841
2842 for (x = 0; x < width; x += 8) {
2843 src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
2844 src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
2845 src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
2846 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2847 vec0, vec1, vec2);
2848 STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
2849 src_uyvy += 16;
2850 rgb_buf += 32;
2851 }
2852 }
2853
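// Blend rows of src_ptr and src_ptr + src_stride into dst_ptr using
// source_y_fraction: 0 copies the first row, 128 averages the two rows,
// any other value does a rounded weighted blend.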
2854 void InterpolateRow_MSA(uint8* dst_ptr,
2855 const uint8* src_ptr,
2856 ptrdiff_t src_stride,
2857 int width,
2858 int32 source_y_fraction) {
2859 int32 y1_fraction = source_y_fraction;
2860 int32 y0_fraction = 256 - y1_fraction;
2861 uint16 y_fractions;
2862 const uint8* s = src_ptr;
2863 const uint8* t = src_ptr + src_stride;
2864 int x;
2865 v16u8 src0, src1, src2, src3, dst0, dst1;
2866 v8u16 vec0, vec1, vec2, vec3, y_frac;
2867
2868 if (0 == y1_fraction) {
2869 memcpy(dst_ptr, src_ptr, width);
2870 return;
2871 }
2872
2873 if (128 == y1_fraction) {
2874 for (x = 0; x < width; x += 32) {
2875 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
2876 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
2877 src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
2878 src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
2879 dst0 = __msa_aver_u_b(src0, src2);
2880 dst1 = __msa_aver_u_b(src1, src3);
2881 ST_UB2(dst0, dst1, dst_ptr, 16);
2882 s += 32;
2883 t += 32;
2884 dst_ptr += 32;
2885 }
2886 return;
2887 }
2888
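// Pack the two 8-bit weights into one 16-bit value so a single unsigned dot
// product per halfword computes y0_fraction * s[i] + y1_fraction * t[i];
// srari then divides by 256 with rounding.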
2889 y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
2890 y_frac = (v8u16)__msa_fill_h(y_fractions);
2891
2892 for (x = 0; x < width; x += 32) {
2893 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
2894 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
2895 src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
2896 src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
2897 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
2898 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
2899 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
2900 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
2901 vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
2902 vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
2903 vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
2904 vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
2905 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
2906 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
2907 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
2908 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
2909 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
2910 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
2911 ST_UB2(dst0, dst1, dst_ptr, 16);
2912 s += 32;
2913 t += 32;
2914 dst_ptr += 32;
2915 }
2916 }
2917
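// Fill a row of ARGB pixels with the 32-bit value v32, storing 4 pixels
// (16 bytes) per iteration.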
2918 void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
2919 int x;
2920 v16u8 dst0 = (v16u8)__msa_fill_w(v32);
2921
2922 for (x = 0; x < width; x += 4) {
2923 ST_UB(dst0, dst_argb);
2924 dst_argb += 16;
2925 }
2926 }
2927
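// Convert 16 RAW pixels to 16 RGB24 pixels by reversing the byte order of
// each 3-byte pixel.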
2928 void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
2929 int x;
2930 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
2931 v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
2932 v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
2933 18, 17, 16, 21, 20, 19, 24, 23};
2934 v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
2935 24, 23, 28, 27, 26, 31, 30, 29};
2936
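// Load 48 bytes (16 pixels), form shifted copies with sldi_b, and use the
// shuffle masks to reverse each 3-byte pixel into the three output vectors.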
2937 for (x = 0; x < width; x += 16) {
2938 src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
2939 src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
2940 src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
2941 src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
2942 src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
2943 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
2944 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
2945 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
2946 ST_UB2(dst0, dst1, dst_rgb24, 16);
2947 ST_UB(dst2, (dst_rgb24 + 32));
2948 src_raw += 48;
2949 dst_rgb24 += 48;
2950 }
2951 }
2952
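// Interleave 16 U and 16 V bytes into 32 bytes of interleaved UV.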
2953 void MergeUVRow_MSA(const uint8* src_u,
2954 const uint8* src_v,
2955 uint8* dst_uv,
2956 int width) {
2957 int x;
2958 v16u8 src0, src1, dst0, dst1;
2959
2960 for (x = 0; x < width; x += 16) {
2961 src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
2962 src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
2963 dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
2964 dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
2965 ST_UB2(dst0, dst1, dst_uv, 16);
2966 src_u += 16;
2967 src_v += 16;
2968 dst_uv += 32;
2969 }
2970 }
2971
2972 #ifdef __cplusplus
2973 } // extern "C"
2974 } // namespace libyuv
2975 #endif
2976
2977 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
2978