/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
13
/* Unsigned saturating subtract, in place, on two v8u16 vectors:
 * outN = sat_sub(outN, inN).  Used by the TM predictors to subtract the
 * top-left pixel from (top + left) without wrapping below zero. */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
  {                                             \
    out0 = __msa_subs_u_h(out0, in0);           \
    out1 = __msa_subs_u_h(out1, in1);           \
  }
19
/* Vertical (V) prediction, 4x4: replicate the 4-pixel above-row into
 * all 4 destination rows. */
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t src_data;

  /* Load the 4 above-pixels as a single 32-bit word. */
  src_data = LW(src);

  /* Store the same word to each of the 4 rows. */
  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
}
28
/* Vertical (V) prediction, 8x8: replicate the 8-pixel above-row into
 * all 8 destination rows (two 32-bit stores per row). */
static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  uint32_t src_data1, src_data2;

  src_data1 = LW(src);
  src_data2 = LW(src + 4);

  for (row = 8; row--;) {
    SW(src_data1, dst);
    SW(src_data2, (dst + 4));
    dst += dst_stride;
  }
}
43
/* Vertical (V) prediction, 16x16: one 16-byte vector store of the
 * above-row per destination row. */
static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src0;

  src0 = LD_UB(src);

  for (row = 16; row--;) {
    ST_UB(src0, dst);
    dst += dst_stride;
  }
}
56
/* Vertical (V) prediction, 32x32: two 16-byte vector stores of the
 * above-row per destination row. */
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src1, src2;

  src1 = LD_UB(src);
  src2 = LD_UB(src + 16);

  for (row = 32; row--;) {
    ST_UB2(src1, src2, dst, 16);
    dst += dst_stride;
  }
}
70
/* Horizontal (H) prediction, 4x4: fill each row with its left-column
 * pixel.  Multiplying a byte by 0x01010101 replicates it into all four
 * byte lanes of a 32-bit word. */
static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t out0, out1, out2, out3;

  out0 = src[0] * 0x01010101;
  out1 = src[1] * 0x01010101;
  out2 = src[2] * 0x01010101;
  out3 = src[3] * 0x01010101;

  SW4(out0, out1, out2, out3, dst, dst_stride);
}
82
/* Horizontal (H) prediction, 8x8: fill each row with its left-column
 * pixel, replicated into all 8 byte lanes of a 64-bit word. */
static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

  out0 = src[0] * 0x0101010101010101ull;
  out1 = src[1] * 0x0101010101010101ull;
  out2 = src[2] * 0x0101010101010101ull;
  out3 = src[3] * 0x0101010101010101ull;
  out4 = src[4] * 0x0101010101010101ull;
  out5 = src[5] * 0x0101010101010101ull;
  out6 = src[6] * 0x0101010101010101ull;
  out7 = src[7] * 0x0101010101010101ull;

  SD4(out0, out1, out2, out3, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out4, out5, out6, out7, dst, dst_stride);
}
100
/* Horizontal (H) prediction, 16x16: splat each left-column pixel across
 * a 16-byte vector and store it as one full row; processes 4 rows per
 * iteration. */
static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 4; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
123
/* Horizontal (H) prediction, 32x32: splat each left-column pixel across
 * a 16-byte vector and store it twice (32 bytes) per row; processes
 * 4 rows per iteration. */
static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 8; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB2(src0, src0, dst, 16);
    dst += dst_stride;
    ST_UB2(src1, src1, dst, 16);
    dst += dst_stride;
    ST_UB2(src2, src2, dst, 16);
    dst += dst_stride;
    ST_UB2(src3, src3, dst, 16);
    dst += dst_stride;
  }
}
152
/* DC prediction, 4x4: fill the block with the rounded average of the
 * 8 boundary pixels (4 above + 4 left); srari by 3 divides the sum by
 * 8 with rounding. */
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Pack top and left rows into one vector, then horizontal-add down
   * to a single sum. */
  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
174
/* DC prediction from one edge only (top OR left), 4x4: fill the block
 * with the rounded average of the 4 edge pixels (srari by 2 == /4). */
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
192
/* DC prediction with no neighbors available, 4x4: fill with the
 * constant 128 (mid-gray). */
static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_w((v4i32)store, 0);

  SW4(out, out, out, out, dst, dst_stride);
}
201
/* DC prediction, 8x8: fill the block with the rounded average of the
 * 16 boundary pixels (8 above + 8 left); srari by 4 == /16 rounded. */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  /* Horizontal-add chain reduces the 16 bytes to a single sum; the
   * pckev/hadd pair folds the two 64-bit lanes together. */
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
228
/* DC prediction from one edge only (top OR left), 8x8: fill the block
 * with the rounded average of the 8 edge pixels (srari by 3 == /8). */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
251
/* DC prediction with no neighbors available, 8x8: fill with the
 * constant 128 (mid-gray). */
static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
  uint64_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_d((v2i64)store, 0);

  SD4(out, out, out, out, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out, out, out, out, dst, dst_stride);
}
262
/* DC prediction, 16x16: fill the block with the rounded average of the
 * 32 boundary pixels (16 above + 16 left); srari by 5 == /32 rounded. */
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  /* Reduce the per-lane partial sums to one scalar sum. */
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
286
/* DC prediction from one edge only (top OR left), 16x16: fill the block
 * with the rounded average of the 16 edge pixels (srari by 4 == /16). */
static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
307
/* DC prediction with no neighbors available, 16x16: fill with the
 * constant 128 (mid-gray). */
static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
315
/* DC prediction, 32x32: fill the block with the rounded average of the
 * 64 boundary pixels (32 above + 32 left); srari by 6 == /64 rounded. */
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* 32 rows, two rows per iteration, 32 bytes per row. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
345
/* DC prediction from one edge only (top OR left), 32x32: fill the block
 * with the rounded average of the 32 edge pixels (srari by 5 == /32). */
static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src, 16, data0, data1);
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
371
/* DC prediction with no neighbors available, 32x32: fill with the
 * constant 128 (mid-gray). */
static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
383
/* TM (true-motion) prediction, 4x4: each output pixel is
 * clip8(left[row] + top[col] - top_left), computed here as an unsigned
 * add (HADD), saturating subtract of top_left (IPRED_SUBS), and a clamp
 * to 8 bits (SAT ... 7). */
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  /* Splat each left pixel so it can be paired with the whole top row. */
  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
411
/* TM (true-motion) prediction, 8x8: pred = clip8(left + top - top_left).
 * Processes 4 rows per loop iteration, two iterations total. */
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
444
/* TM (true-motion) prediction, 16x16: pred = clip8(left + top - top_left).
 * Each row widens (left + top) into right/left halves (ILVRL + HADD),
 * saturating-subtracts top_left, clamps to 8 bits, and packs back to
 * bytes.  Processes 4 rows per loop iteration. */
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);

    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}
493
/* TM (true-motion) prediction, 32x32: pred = clip8(left + top - top_left).
 * The 32-pixel top row is held in two vectors (src_top0/src_top1);
 * each output row is produced by widening (left + top) for both halves,
 * saturating-subtracting top_left, clamping to 8 bits, and packing.
 * Processes 4 rows per loop iteration. */
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
553
/* Public V-predictor entry point, 4x4; `left` is unused for vertical. */
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_4x4_msa(above, dst, y_stride);
}
560
/* Public V-predictor entry point, 8x8; `left` is unused for vertical. */
void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_8x8_msa(above, dst, y_stride);
}
567
/* Public V-predictor entry point, 16x16; `left` is unused for vertical. */
void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_16x16_msa(above, dst, y_stride);
}
574
/* Public V-predictor entry point, 32x32; `left` is unused for vertical. */
void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_32x32_msa(above, dst, y_stride);
}
581
/* Public H-predictor entry point, 4x4; `above` is unused for horizontal. */
void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}
588
/* Public H-predictor entry point, 8x8; `above` is unused for horizontal. */
void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}
595
/* Public H-predictor entry point, 16x16; `above` is unused for horizontal. */
void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}
602
/* Public H-predictor entry point, 32x32; `above` is unused for horizontal. */
void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}
609
/* Public DC-predictor entry point, 4x4 (uses both above and left edges). */
void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}
614
/* Public DC-predictor entry point, 8x8 (uses both above and left edges). */
void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}
619
/* Public DC-predictor entry point, 16x16 (uses both above and left edges). */
void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}
624
/* Public DC-predictor entry point, 32x32 (uses both above and left edges). */
void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}
629
/* DC-top predictor, 4x4: average of the above row only; `left` unused. */
void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}
636
/* DC-top predictor, 8x8: average of the above row only; `left` unused. */
void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}
643
/* DC-top predictor, 16x16: average of the above row only; `left` unused. */
void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}
650
/* DC-top predictor, 32x32: average of the above row only; `left` unused. */
void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}
657
/* DC-left predictor, 4x4: average of the left column only; `above` unused. */
void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}
664
/* DC-left predictor, 8x8: average of the left column only; `above` unused. */
void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}
671
/* DC-left predictor, 16x16: average of the left column only; `above` unused. */
void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}
679
/* DC-left predictor, 32x32: average of the left column only; `above` unused. */
void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}
687
/* DC-128 predictor, 4x4: no neighbors available; both edges unused. */
void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_4x4_msa(dst, y_stride);
}
695
/* DC-128 predictor, 8x8: no neighbors available; both edges unused. */
void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_8x8_msa(dst, y_stride);
}
703
/* DC-128 predictor, 16x16: no neighbors available; both edges unused. */
void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_16x16_msa(dst, y_stride);
}
711
/* DC-128 predictor, 32x32: no neighbors available; both edges unused. */
void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_32x32_msa(dst, y_stride);
}
719
/* Public TM-predictor entry point, 4x4. */
void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}
724
/* Public TM-predictor entry point, 8x8. */
void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}
729
/* Public TM-predictor entry point, 16x16. */
void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}
734
/* Public TM-predictor entry point, 32x32. */
void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}
739