/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "libyuv/scale_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

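// Point samples 8 ARGB pixels to 4 per loop, keeping the second pixel of each
// pair.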
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  int x;
  v16u8 src0, src1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

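// Averages each horizontal pair of ARGB pixels; no vertical filtering.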
void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

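// Box filters 8x2 ARGB pixels to 4x1, averaging 2x2 blocks with rounding.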
void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  const uint8_t* s = src_argb;
  const uint8_t* t = src_argb + src_stride;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3;
  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg0 += reg2;
    reg1 += reg3;
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_argb);
    s += 32;
    t += 32;
    dst_argb += 16;
  }
}

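// Copies one ARGB pixel from every src_stepx-th position; 4 destination
// pixels per loop.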
void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int32_t src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  int32_t stepx = src_stepx * 4;
  int32_t data0, data1, data2, data3;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LW(src_argb);
    data1 = LW(src_argb + stepx);
    data2 = LW(src_argb + stepx * 2);
    data3 = LW(src_argb + stepx * 3);
    SW(data0, dst_argb);
    SW(data1, dst_argb + 4);
    SW(data2, dst_argb + 8);
    SW(data3, dst_argb + 12);
    src_argb += stepx * 4;
    dst_argb += 16;
  }
}

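// Averages a 2x2 ARGB block at every src_stepx-th position, with rounding.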
void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  const uint8_t* nxt_argb = src_argb + src_stride;
  int32_t stepx = src_stepx * 4;
  int64_t data0, data1, data2, data3;
  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
  v16u8 vec0, vec1, vec2, vec3;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LD(src_argb);
    data1 = LD(src_argb + stepx);
    data2 = LD(src_argb + stepx * 2);
    data3 = LD(src_argb + stepx * 3);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
    data0 = LD(nxt_argb);
    data1 = LD(nxt_argb + stepx);
    data2 = LD(nxt_argb + stepx * 2);
    data3 = LD(nxt_argb + stepx * 3);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
    reg4 += reg6;
    reg5 += reg7;
    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_argb);
    src_argb += stepx * 4;
    nxt_argb += stepx * 4;
    dst_argb += 16;
  }
}

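// Point samples 64 pixels to 32 per loop, keeping the odd pixel of each pair.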
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

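// Averages each horizontal pair of pixels; 32 destination pixels per loop.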
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = __msa_aver_u_b(vec1, vec0);
    dst1 = __msa_aver_u_b(vec3, vec2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

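// Box filters 64x2 pixels to 32x1, averaging 2x2 blocks with rounding.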
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    ST_UB2(dst0, dst1, dst, 16);
    s += 64;
    t += 64;
    dst += 32;
  }
}

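// Point samples 64 pixels to 16 per loop, keeping pixel 2 of each group of 4.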
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    src_ptr += 64;
    dst += 16;
  }
}

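// Box filters 64x4 pixels to 16x1, averaging 4x4 blocks with rounding.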
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  const uint8_t* t2 = s + src_stride * 3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
    vec0 += __msa_hadd_u_h(src0, src0);
    vec1 += __msa_hadd_u_h(src1, src1);
    vec2 += __msa_hadd_u_h(src2, src2);
    vec3 += __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    reg0 = __msa_hadd_u_w(vec0, vec0);
    reg1 = __msa_hadd_u_w(vec1, vec1);
    reg2 = __msa_hadd_u_w(vec2, vec2);
    reg3 = __msa_hadd_u_w(vec3, vec3);
    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    s += 64;
    t0 += 64;
    t1 += 64;
    t2 += 64;
    dst += 16;
  }
}

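// Point samples 32 pixels to 12 per loop (3/8 scale).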
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  int x, width;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, vec0;
  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
  (void)src_stride;

  assert(dst_width % 3 == 0);
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
    SD(dst0, dst);
    SW(dst1, dst + 8);
    src_ptr += 32;
    dst += 12;
  }
}

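// Scales 32x2 pixels to 12x1 (3/8 scale), box filtering two source rows.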
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8i16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x2AAA;
    tmp1 *= const_0x2AAA;
    tmp4 *= const_0x4000;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t += 32;
    dst_ptr += 12;
  }
}

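// Scales 32x3 pixels to 12x1 (3/8 scale), box filtering three source rows.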
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, src4, src5, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8u16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x1C71;
    tmp1 *= const_0x1C71;
    tmp4 *= const_0x2AAA;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t0 += 32;
    t1 += 32;
    dst_ptr += 12;
  }
}

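// Adds a row of source bytes into a row of 16-bit sums; 16 pixels per loop.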
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  int x;
  v16u8 src0;
  v8u16 dst0, dst1;
  v16i8 zero = {0};

  assert(src_width > 0);

  for (x = 0; x < src_width; x += 16) {
    src0 = LD_UB(src_ptr);
    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    ST_UH2(dst0, dst1, dst_ptr, 8);
    src_ptr += 16;
    dst_ptr += 16;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)