1 /*
2 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavcodec/hevcdec.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "hevcpred_mips.h"
24
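/*
 * Angle step tables for HEVC angular intra prediction, in 1/32-sample units:
 * intra_pred_angle_up[] covers the mostly vertical modes 18..34 and
 * intra_pred_angle_low[] the mostly horizontal modes 2..17.
 */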
25 static const int8_t intra_pred_angle_up[17] = {
26 -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
27 };
28
29 static const int8_t intra_pred_angle_low[16] = {
30 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
31 };
32
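/*
 * Produces two 16-pixel rows of planar prediction. src0_r/src0_l hold the
 * unpacked top reference, tmp0/tmp1 the replicated top-right and bottom-left
 * samples, vec0/vec1 the replicated left samples of the two rows, and
 * mul_val_h0..h3 the per-column weights. mul_val_b0/mul_val_b1 are the row
 * weights of the first row (the second row uses mul_val_b0 - 1 and
 * mul_val_b1 + 1). The sums are rounded by 'round' and packed into res0/res1.
 */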
33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, \
34 mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
35 res0, res1, mul_val_b0, mul_val_b1, round) \
36 { \
37 v8i16 res0_m, res1_m, res2_m, res3_m; \
38 \
39 MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1, \
40 mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m); \
41 \
42 res0_m += mul_val_h1 * tmp0; \
43 res1_m += mul_val_h3 * tmp0; \
44 res2_m += mul_val_h1 * tmp0; \
45 res3_m += mul_val_h3 * tmp0; \
46 \
47 res0_m += mul_val_b0 * src0_r; \
48 res1_m += mul_val_b0 * src0_l; \
49 res2_m += (mul_val_b0 - 1) * src0_r; \
50 res3_m += (mul_val_b0 - 1) * src0_l; \
51 \
52 res0_m += mul_val_b1 * tmp1; \
53 res1_m += mul_val_b1 * tmp1; \
54 res2_m += (mul_val_b1 + 1) * tmp1; \
55 res3_m += (mul_val_b1 + 1) * tmp1; \
56 \
57 SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round); \
58 PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1); \
59 }
60
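/*
 * Vertical prediction (mode 26): the top reference row is copied into every
 * row of the block. 'flag' carries c_idx from the caller, so for luma
 * (flag == 0) the first column is additionally smoothed with
 * top[0] + ((left[y] - left[-1]) >> 1), clipped to 8 bits.
 */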
61 static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62 const uint8_t *src_left,
63 uint8_t *dst, int32_t stride,
64 int32_t flag)
65 {
66 uint32_t col;
67 uint32_t src_data;
68 v8i16 vec0, vec1, vec2;
69 v16i8 zero = { 0 };
70
71 src_data = LW(src_top);
72 SW4(src_data, src_data, src_data, src_data, dst, stride);
73
74 if (0 == flag) {
75 src_data = LW(src_left);
76
77 vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
78
79 vec0 = __msa_fill_h(src_left[-1]);
80 vec1 = __msa_fill_h(src_top[0]);
81
82 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
83 vec2 -= vec0;
84 vec2 >>= 1;
85 vec2 += vec1;
86 CLIP_SH_0_255(vec2);
87
88 for (col = 0; col < 4; col++) {
89 dst[stride * col] = (uint8_t) vec2[col];
90 }
91 }
92 }
93
94 static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
95 const uint8_t *src_left,
96 uint8_t *dst, int32_t stride,
97 int32_t flag)
98 {
99 uint8_t *tmp_dst = dst;
100 uint32_t row;
101 uint16_t val0, val1, val2, val3;
102 uint64_t src_data1;
103 v8i16 vec0, vec1, vec2;
104 v16i8 zero = { 0 };
105
106 src_data1 = LD(src_top);
107
108 for (row = 8; row--;) {
109 SD(src_data1, tmp_dst);
110 tmp_dst += stride;
111 }
112
113 if (0 == flag) {
114 src_data1 = LD(src_left);
115
116 vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
117
118 vec0 = __msa_fill_h(src_left[-1]);
119 vec1 = __msa_fill_h(src_top[0]);
120
121 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
122 vec2 -= vec0;
123 vec2 >>= 1;
124 vec2 += vec1;
125 CLIP_SH_0_255(vec2);
126
127 val0 = vec2[0];
128 val1 = vec2[1];
129 val2 = vec2[2];
130 val3 = vec2[3];
131
132 dst[0] = val0;
133 dst[stride] = val1;
134 dst[2 * stride] = val2;
135 dst[3 * stride] = val3;
136
137 val0 = vec2[4];
138 val1 = vec2[5];
139 val2 = vec2[6];
140 val3 = vec2[7];
141
142 dst[4 * stride] = val0;
143 dst[5 * stride] = val1;
144 dst[6 * stride] = val2;
145 dst[7 * stride] = val3;
146 }
147 }
148
149 static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
150 const uint8_t *src_left,
151 uint8_t *dst, int32_t stride,
152 int32_t flag)
153 {
154 int32_t col;
155 uint8_t *tmp_dst = dst;
156 uint32_t row;
157 v16u8 src;
158 v8i16 vec0, vec1, vec2, vec3;
159
160 src = LD_UB(src_top);
161
162 for (row = 16; row--;) {
163 ST_UB(src, tmp_dst);
164 tmp_dst += stride;
165 }
166
167 if (0 == flag) {
168 src = LD_UB(src_left);
169
170 vec0 = __msa_fill_h(src_left[-1]);
171 vec1 = __msa_fill_h(src_top[0]);
172
173 UNPCK_UB_SH(src, vec2, vec3);
174 SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
175
176 vec2 >>= 1;
177 vec3 >>= 1;
178
179 ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
180 CLIP_SH2_0_255(vec2, vec3);
181
182 src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
183
184 for (col = 0; col < 16; col++) {
185 dst[stride * col] = src[col];
186 }
187 }
188 }
189
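/*
 * Horizontal prediction (mode 10): each row is filled with the corresponding
 * left reference sample. For luma (flag == 0) the first row is smoothed with
 * left[0] + ((top[x] - top[-1]) >> 1), clipped; the 32x32 variant below
 * applies no edge filtering.
 */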
190 static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191 const uint8_t *src_left,
192 uint8_t *dst, int32_t stride,
193 int32_t flag)
194 {
195 uint32_t val0, val1, val2, val3;
196 v16i8 src0;
197 v8i16 src0_r, src_top_val, src_left_val;
198 v16i8 zero = { 0 };
199
200 val0 = src_left[0] * 0x01010101;
201 val1 = src_left[1] * 0x01010101;
202 val2 = src_left[2] * 0x01010101;
203 val3 = src_left[3] * 0x01010101;
204 SW4(val0, val1, val2, val3, dst, stride);
205
206 if (0 == flag) {
207 val0 = LW(src_top);
208 src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209 src_top_val = __msa_fill_h(src_top[-1]);
210 src_left_val = __msa_fill_h(src_left[0]);
211
212 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
213
214 src0_r -= src_top_val;
215 src0_r >>= 1;
216 src0_r += src_left_val;
217 CLIP_SH_0_255(src0_r);
218 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219 val0 = __msa_copy_s_w((v4i32) src0, 0);
220 SW(val0, dst);
221 }
222 }
223
224 static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225 const uint8_t *src_left,
226 uint8_t *dst, int32_t stride,
227 int32_t flag)
228 {
229 uint64_t val0, val1, val2, val3;
230 v16i8 src0;
231 v8i16 src0_r, src_top_val, src_left_val;
232 v16i8 zero = { 0 };
233
234 val0 = src_left[0] * 0x0101010101010101;
235 val1 = src_left[1] * 0x0101010101010101;
236 val2 = src_left[2] * 0x0101010101010101;
237 val3 = src_left[3] * 0x0101010101010101;
238 SD4(val0, val1, val2, val3, dst, stride);
239
240 val0 = src_left[4] * 0x0101010101010101;
241 val1 = src_left[5] * 0x0101010101010101;
242 val2 = src_left[6] * 0x0101010101010101;
243 val3 = src_left[7] * 0x0101010101010101;
244 SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
245
246 if (0 == flag) {
247 val0 = LD(src_top);
248 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249 src_top_val = __msa_fill_h(src_top[-1]);
250 src_left_val = __msa_fill_h(src_left[0]);
251
252 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
253
254 src0_r -= src_top_val;
255 src0_r >>= 1;
256 src0_r += src_left_val;
257 CLIP_SH_0_255(src0_r);
258 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259 val0 = __msa_copy_s_d((v2i64) src0, 0);
260 SD(val0, dst);
261 }
262 }
263
264 static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
265 const uint8_t *src_left,
266 uint8_t *dst, int32_t stride,
267 int32_t flag)
268 {
269 uint8_t *tmp_dst = dst;
270 uint32_t row;
271 uint8_t inp0, inp1, inp2, inp3;
272 v16i8 src0, src1, src2, src3;
273 v8i16 src0_r, src0_l, src_left_val, src_top_val;
274
275 src_left_val = __msa_fill_h(src_left[0]);
276
277 for (row = 4; row--;) {
278 inp0 = src_left[0];
279 inp1 = src_left[1];
280 inp2 = src_left[2];
281 inp3 = src_left[3];
282 src_left += 4;
283
284 src0 = __msa_fill_b(inp0);
285 src1 = __msa_fill_b(inp1);
286 src2 = __msa_fill_b(inp2);
287 src3 = __msa_fill_b(inp3);
288
289 ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
290 tmp_dst += (4 * stride);
291 }
292
293 if (0 == flag) {
294 src0 = LD_SB(src_top);
295 src_top_val = __msa_fill_h(src_top[-1]);
296
297 UNPCK_UB_SH(src0, src0_r, src0_l);
298 SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
299
300 src0_r >>= 1;
301 src0_l >>= 1;
302
303 ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
304 CLIP_SH2_0_255(src0_r, src0_l);
305 src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
306 ST_SB(src0, dst);
307 }
308 }
309
310 static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311 const uint8_t *src_left,
312 uint8_t *dst, int32_t stride)
313 {
314 uint32_t row;
315 uint8_t inp0, inp1, inp2, inp3;
316 v16i8 src0, src1, src2, src3;
317
318 for (row = 0; row < 8; row++) {
319 inp0 = src_left[row * 4];
320 inp1 = src_left[row * 4 + 1];
321 inp2 = src_left[row * 4 + 2];
322 inp3 = src_left[row * 4 + 3];
323
324 src0 = __msa_fill_b(inp0);
325 src1 = __msa_fill_b(inp1);
326 src2 = __msa_fill_b(inp2);
327 src3 = __msa_fill_b(inp3);
328
329 ST_SB2(src0, src0, dst, 16);
330 dst += stride;
331 ST_SB2(src1, src1, dst, 16);
332 dst += stride;
333 ST_SB2(src2, src2, dst, 16);
334 dst += stride;
335 ST_SB2(src3, src3, dst, 16);
336 dst += stride;
337 }
338 }
339
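/*
 * DC prediction: the block is filled with the rounded average of the 2 * size
 * top and left reference samples. For luma (flag == 0) the first row and
 * first column are blended towards the references, (ref + 3 * dc + 2) >> 2,
 * and the corner uses (left[0] + 2 * dc + top[0] + 2) >> 2. The 32x32 variant
 * applies no edge filtering.
 */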
340 static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341 const uint8_t *src_left,
342 uint8_t *dst, int32_t stride,
343 int32_t flag)
344 {
345 uint8_t *tmp_dst = dst;
346 uint32_t addition = 0;
347 uint32_t val0, val1, val2;
348 v16i8 src = { 0 };
349 v16u8 store;
350 v16i8 zero = { 0 };
351 v8u16 sum, vec0, vec1;
352
353 val0 = LW(src_top);
354 val1 = LW(src_left);
355 INSERT_W2_SB(val0, val1, src);
356 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357 sum = (v8u16) __msa_hadd_u_w(sum, sum);
358 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359 sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360 addition = __msa_copy_u_w((v4i32) sum, 0);
361 store = (v16u8) __msa_fill_b(addition);
362 val0 = __msa_copy_u_w((v4i32) store, 0);
363 SW4(val0, val0, val0, val0, dst, stride)
364
365 if (0 == flag) {
366 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
367
368 vec1 += vec0;
369 vec0 += vec0;
370 vec1 += vec0;
371
372 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374 val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375 store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376 val0 = __msa_copy_u_w((v4i32) store, 0);
377 SW(val0, tmp_dst);
378
379 val0 = src_left[1];
380 val1 = src_left[2];
381 val2 = src_left[3];
382
383 addition *= 3;
384
385 ADD2(val0, addition, val1, addition, val0, val1);
386 val2 += addition;
387
388 val0 += 2;
389 val1 += 2;
390 val2 += 2;
391 val0 >>= 2;
392 val1 >>= 2;
393 val2 >>= 2;
394
395 tmp_dst[stride * 1] = val0;
396 tmp_dst[stride * 2] = val1;
397 tmp_dst[stride * 3] = val2;
398 }
399 }
400
401 static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
402 const uint8_t *src_left,
403 uint8_t *dst, int32_t stride,
404 int32_t flag)
405 {
406 uint8_t *tmp_dst = dst;
407 uint32_t row, col, val;
408 uint32_t addition = 0;
409 uint64_t val0, val1;
410 v16u8 src = { 0 };
411 v16u8 store;
412 v8u16 sum, vec0, vec1;
413 v16i8 zero = { 0 };
414
415 val0 = LD(src_top);
416 val1 = LD(src_left);
417 INSERT_D2_UB(val0, val1, src);
418 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419 sum = (v8u16) __msa_hadd_u_w(sum, sum);
420 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423 sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424 addition = __msa_copy_u_w((v4i32) sum, 0);
425 store = (v16u8) __msa_fill_b(addition);
426 val0 = __msa_copy_u_d((v2i64) store, 0);
427
428 for (row = 8; row--;) {
429 SD(val0, dst);
430 dst += stride;
431 }
432
433 if (0 == flag) {
434 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
435
436 vec1 += vec0;
437 vec0 += vec0;
438 vec1 += vec0;
439 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443 val0 = __msa_copy_u_d((v2i64) store, 0);
444 SD(val0, tmp_dst);
445
446 val0 = LD(src_left);
447 src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448 vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449 vec0 = (v8u16) __msa_fill_h(addition);
450 vec0 *= 3;
451 vec1 += vec0;
452 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
453
454 for (col = 1; col < 8; col++) {
455 tmp_dst[stride * col] = vec1[col];
456 }
457 }
458 }
459
460 static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
461 const uint8_t *src_left,
462 uint8_t *dst, int32_t stride,
463 int32_t flag)
464 {
465 uint8_t *tmp_dst = dst;
466 uint32_t row, col, val;
467 uint32_t addition = 0;
468 v16u8 src_above1, store, src_left1;
469 v8u16 sum, sum_above, sum_left;
470 v8u16 vec0, vec1, vec2;
471 v16i8 zero = { 0 };
472
473 src_above1 = LD_UB(src_top);
474 src_left1 = LD_UB(src_left);
475
476 HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477 sum = sum_above + sum_left;
478 sum = (v8u16) __msa_hadd_u_w(sum, sum);
479 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482 sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483 addition = __msa_copy_u_w((v4i32) sum, 0);
484 store = (v16u8) __msa_fill_b(addition);
485
486 for (row = 16; row--;) {
487 ST_UB(store, dst);
488 dst += stride;
489 }
490
491 if (0 == flag) {
492 vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
493 ILVRL_B2_UH(zero, src_above1, vec1, vec2);
494 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
495 vec0 += vec0;
496 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
497 SRARI_H2_UH(vec1, vec2, 2);
498 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501 ST_UB(store, tmp_dst);
502
503 ILVRL_B2_UH(zero, src_left1, vec1, vec2);
504 vec0 = (v8u16) __msa_fill_h(addition);
505 vec0 *= 3;
506 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
507 SRARI_H2_UH(vec1, vec2, 2);
508 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
509
510 for (col = 1; col < 16; col++) {
511 tmp_dst[stride * col] = store[col];
512 }
513 }
514 }
515
516 static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
517 const uint8_t *src_left,
518 uint8_t *dst, int32_t stride)
519 {
520 uint32_t row;
521 v16u8 src_above1, src_above2, store, src_left1, src_left2;
522 v8u16 sum_above1, sum_above2;
523 v8u16 sum_left1, sum_left2;
524 v8u16 sum, sum_above, sum_left;
525
526 LD_UB2(src_top, 16, src_above1, src_above2);
527 LD_UB2(src_left, 16, src_left1, src_left2);
528 HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529 HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530 sum_above = sum_above1 + sum_above2;
531 sum_left = sum_left1 + sum_left2;
532 sum = sum_above + sum_left;
533 sum = (v8u16) __msa_hadd_u_w(sum, sum);
534 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537 sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538 store = (v16u8) __msa_splati_b((v16i8) sum, 0);
539
540 for (row = 16; row--;) {
541 ST_UB2(store, store, dst, 16);
542 dst += stride;
543 ST_UB2(store, store, dst, 16);
544 dst += stride;
545 }
546 }
547
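/*
 * Planar prediction: every output sample is the weighted average of the top
 * row, left column, top-right and bottom-left reference samples, rounded by
 * (log2(size) + 1) bits as in the HEVC specification.
 */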
548 static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
549 const uint8_t *src_left,
550 uint8_t *dst, int32_t stride)
551 {
552 uint32_t src0, src1;
553 v16i8 src_vec0, src_vec1;
554 v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555 v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556 v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
557 v16i8 zero = { 0 };
558
559 src0 = LW(src_top);
560 src1 = LW(src_left);
561
562 mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
563
564 src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565 src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
566
567 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
568 SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
569
570 tmp0 = __msa_fill_h(src_top[4]);
571 tmp1 = __msa_fill_h(src_left[4]);
572
573 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574 res0, res1, res2, res3);
575
576 res0 += mul_val1 * tmp0;
577 res1 += mul_val1 * tmp0;
578 res2 += mul_val1 * tmp0;
579 res3 += mul_val1 * tmp0;
580
581 res0 += 3 * src_vec0_r;
582 res1 += 2 * src_vec0_r;
583 res2 += src_vec0_r;
584 res0 += tmp1;
585 res1 += 2 * tmp1;
586 res2 += 3 * tmp1;
587 res3 += 4 * tmp1;
588
589 PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
590 SRARI_H2_SH(res0, res1, 3);
591 src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592 ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
593 }
594
595 static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
596 const uint8_t *src_left,
597 uint8_t *dst, int32_t stride)
598 {
599 uint64_t src0, src1;
600 v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601 v8i16 src_vec0_r, src_vec1_r;
602 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604 v8i16 tmp0, tmp1, tmp2;
605 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606 v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
607 v16i8 zero = { 0 };
608
609 src0 = LD(src_top);
610 src1 = LD(src_left);
611
612 src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613 src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
614
615 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
616 SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617 SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
618
619 tmp0 = __msa_fill_h(src_top[8]);
620 tmp1 = __msa_fill_h(src_left[8]);
621
622 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623 res0, res1, res2, res3);
624 MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625 res4, res5, res6, res7);
626
627 tmp2 = mul_val1 * tmp0;
628 res0 += tmp2;
629 res1 += tmp2;
630 res2 += tmp2;
631 res3 += tmp2;
632 res4 += tmp2;
633 res5 += tmp2;
634 res6 += tmp2;
635 res7 += tmp2;
636
637 res0 += 7 * src_vec0_r;
638 res1 += 6 * src_vec0_r;
639 res2 += 5 * src_vec0_r;
640 res3 += 4 * src_vec0_r;
641 res4 += 3 * src_vec0_r;
642 res5 += 2 * src_vec0_r;
643 res6 += src_vec0_r;
644
645 res0 += tmp1;
646 res1 += 2 * tmp1;
647 res2 += 3 * tmp1;
648 res3 += 4 * tmp1;
649 res4 += 5 * tmp1;
650 res5 += 6 * tmp1;
651 res6 += 7 * tmp1;
652 res7 += 8 * tmp1;
653
654 SRARI_H4_SH(res0, res1, res2, res3, 4);
655 SRARI_H4_SH(res4, res5, res6, res7, 4);
656 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657 src_vec0, src_vec1, src_vec2, src_vec3);
658
659 ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
660 0, 1, 0, 1, dst, stride);
661 }
662
663 static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
664 const uint8_t *src_left,
665 uint8_t *dst, int32_t stride)
666 {
667 v16u8 src0, src1;
668 v8i16 src0_r, src1_r, src0_l, src1_l;
669 v8i16 vec0, vec1;
670 v8i16 res0, res1, tmp0, tmp1;
671 v8i16 mul_val2, mul_val3;
672 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
673 v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
674
675 src0 = LD_UB(src_top);
676 src1 = LD_UB(src_left);
677
678 UNPCK_UB_SH(src0, src0_r, src0_l);
679 UNPCK_UB_SH(src1, src1_r, src1_l);
680
681 mul_val2 = mul_val0 - 8;
682 mul_val3 = mul_val1 + 8;
683
684 tmp0 = __msa_fill_h(src_top[16]);
685 tmp1 = __msa_fill_h(src_left[16]);
686
687 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
688 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
689 mul_val0, mul_val1, mul_val2, mul_val3,
690 res0, res1, 15, 1, 5);
691 ST_SH2(res0, res1, dst, stride);
692 dst += (2 * stride);
693
694 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
695 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
696 mul_val0, mul_val1, mul_val2, mul_val3,
697 res0, res1, 13, 3, 5);
698 ST_SH2(res0, res1, dst, stride);
699 dst += (2 * stride);
700
701 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
702 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
703 mul_val0, mul_val1, mul_val2, mul_val3,
704 res0, res1, 11, 5, 5);
705 ST_SH2(res0, res1, dst, stride);
706 dst += (2 * stride);
707
708 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
709 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
710 mul_val0, mul_val1, mul_val2, mul_val3,
711 res0, res1, 9, 7, 5);
712 ST_SH2(res0, res1, dst, stride);
713 dst += (2 * stride);
714
715 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
716 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
717 mul_val0, mul_val1, mul_val2, mul_val3,
718 res0, res1, 7, 9, 5);
719 ST_SH2(res0, res1, dst, stride);
720 dst += (2 * stride);
721
722 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
723 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
724 mul_val0, mul_val1, mul_val2, mul_val3,
725 res0, res1, 5, 11, 5);
726 ST_SH2(res0, res1, dst, stride);
727 dst += (2 * stride);
728
729 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
730 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
731 mul_val0, mul_val1, mul_val2, mul_val3,
732 res0, res1, 3, 13, 5);
733 ST_SH2(res0, res1, dst, stride);
734 dst += (2 * stride);
735
736 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
737 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
738 mul_val0, mul_val1, mul_val2, mul_val3,
739 res0, res1, 1, 15, 5);
740 ST_SH2(res0, res1, dst, stride);
741 }
742
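/*
 * Helpers for the 32x32 planar prediction: each call produces one 16x16
 * quadrant. 'offset' shifts the horizontal weights by 16 for the right-hand
 * quadrants, while the upper/lower variants differ in the bottom-left sample
 * and the row weights they use.
 */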
743 static void process_intra_upper_16x16_msa(const uint8_t *src_top,
744 const uint8_t *src_left,
745 uint8_t *dst, int32_t stride,
746 uint8_t offset)
747 {
748 v16i8 src0, src1;
749 v8i16 src0_r, src1_r, src0_l, src1_l;
750 v8i16 vec0, vec1, res0, res1;
751 v8i16 tmp0, tmp1;
752 v8i16 mul_val2, mul_val3;
753 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
754 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
755
756 tmp0 = __msa_fill_h(src_top[32 - offset]);
757 tmp1 = __msa_fill_h(src_left[32]);
758
759 src0 = LD_SB(src_top);
760 src1 = LD_SB(src_left);
761
762 UNPCK_UB_SH(src0, src0_r, src0_l);
763 UNPCK_UB_SH(src1, src1_r, src1_l);
764
765 mul_val1 += offset;
766 mul_val0 -= offset;
767 mul_val2 = mul_val0 - 8;
768 mul_val3 = mul_val1 + 8;
769
770 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
771 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
772 mul_val0, mul_val1, mul_val2, mul_val3,
773 res0, res1, 31, 1, 6);
774 ST_SH2(res0, res1, dst, stride);
775 dst += (2 * stride);
776
777 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
778 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
779 mul_val0, mul_val1, mul_val2, mul_val3,
780 res0, res1, 29, 3, 6);
781 ST_SH2(res0, res1, dst, stride);
782 dst += (2 * stride);
783
784 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
785 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
786 mul_val0, mul_val1, mul_val2, mul_val3,
787 res0, res1, 27, 5, 6);
788 ST_SH2(res0, res1, dst, stride);
789 dst += (2 * stride);
790
791 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
792 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
793 mul_val0, mul_val1, mul_val2, mul_val3,
794 res0, res1, 25, 7, 6);
795 ST_SH2(res0, res1, dst, stride);
796 dst += (2 * stride);
797
798 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
799 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
800 mul_val0, mul_val1, mul_val2, mul_val3,
801 res0, res1, 23, 9, 6);
802 ST_SH2(res0, res1, dst, stride);
803 dst += (2 * stride);
804
805 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
806 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
807 mul_val0, mul_val1, mul_val2, mul_val3,
808 res0, res1, 21, 11, 6);
809 ST_SH2(res0, res1, dst, stride);
810 dst += (2 * stride);
811
812 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
813 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
814 mul_val0, mul_val1, mul_val2, mul_val3,
815 res0, res1, 19, 13, 6);
816 ST_SH2(res0, res1, dst, stride);
817 dst += (2 * stride);
818
819 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
820 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
821 mul_val0, mul_val1, mul_val2, mul_val3,
822 res0, res1, 17, 15, 6);
823 ST_SH2(res0, res1, dst, stride);
824 }
825
826 static void process_intra_lower_16x16_msa(const uint8_t *src_top,
827 const uint8_t *src_left,
828 uint8_t *dst, int32_t stride,
829 uint8_t offset)
830 {
831 v16i8 src0, src1;
832 v8i16 src0_r, src1_r, src0_l, src1_l;
833 v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
834 v8i16 mul_val2, mul_val3;
835 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
836 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
837
838 tmp0 = __msa_fill_h(src_top[32 - offset]);
839 tmp1 = __msa_fill_h(src_left[16]);
840
841 src0 = LD_SB(src_top);
842 src1 = LD_SB(src_left);
843
844 UNPCK_UB_SH(src0, src0_r, src0_l);
845 UNPCK_UB_SH(src1, src1_r, src1_l);
846
847 mul_val1 += offset;
848 mul_val0 -= offset;
849 mul_val2 = mul_val0 - 8;
850 mul_val3 = mul_val1 + 8;
851
852 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
853 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
854 mul_val0, mul_val1, mul_val2, mul_val3,
855 res0, res1, 15, 17, 6);
856 ST_SH2(res0, res1, dst, stride);
857 dst += (2 * stride);
858
859 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
860 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
861 mul_val0, mul_val1, mul_val2, mul_val3,
862 res0, res1, 13, 19, 6);
863 ST_SH2(res0, res1, dst, stride);
864 dst += (2 * stride);
865
866 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
867 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
868 mul_val0, mul_val1, mul_val2, mul_val3,
869 res0, res1, 11, 21, 6);
870 ST_SH2(res0, res1, dst, stride);
871 dst += (2 * stride);
872
873 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
874 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
875 mul_val0, mul_val1, mul_val2, mul_val3,
876 res0, res1, 9, 23, 6);
877 ST_SH2(res0, res1, dst, stride);
878 dst += (2 * stride);
879
880 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
881 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
882 mul_val0, mul_val1, mul_val2, mul_val3,
883 res0, res1, 7, 25, 6);
884 ST_SH2(res0, res1, dst, stride);
885 dst += (2 * stride);
886
887 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
888 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
889 mul_val0, mul_val1, mul_val2, mul_val3,
890 res0, res1, 5, 27, 6);
891 ST_SH2(res0, res1, dst, stride);
892 dst += (2 * stride);
893
894 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
895 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
896 mul_val0, mul_val1, mul_val2, mul_val3,
897 res0, res1, 3, 29, 6);
898 ST_SH2(res0, res1, dst, stride);
899 dst += (2 * stride);
900
901 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
902 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
903 mul_val0, mul_val1, mul_val2, mul_val3,
904 res0, res1, 1, 31, 6);
905 ST_SH2(res0, res1, dst, stride);
906 }
907
908 static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
909 const uint8_t *src_left,
910 uint8_t *dst, int32_t stride)
911 {
912 process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
913 process_intra_upper_16x16_msa((src_top + 16), src_left,
914 (dst + 16), stride, 16);
915 dst += (16 * stride);
916 src_left += 16;
917
918 process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
919 process_intra_lower_16x16_msa((src_top + 16), src_left,
920 (dst + 16), stride, 16);
921 }
922
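/*
 * Angular prediction for the mostly vertical modes 18..34 ("upper"). For
 * negative angles, left reference samples are projected onto the extended top
 * reference in ref_array using the inverse angle. Each row y is then
 * interpolated from ref[x + idx + 1] and ref[x + idx + 2] with weights
 * (32 - fact) and fact, where idx = ((y + 1) * angle) >> 5 and
 * fact = ((y + 1) * angle) & 31, rounded by 5 bits.
 */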
923 static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
924 const uint8_t *src_left,
925 uint8_t *dst,
926 int32_t stride,
927 int32_t mode)
928 {
929 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
930 uint8_t ref_array[3 * 32 + 4];
931 uint8_t *ref_tmp = ref_array + 4;
932 const uint8_t *ref;
933 int32_t last;
934 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935 int32_t idx2, fact_val2, idx3, fact_val3;
936 int32_t angle, angle_loop;
937 int32_t inv_angle_val, offset;
938 uint64_t tmp0;
939 v16i8 top0, top1, top2, top3;
940 v16i8 dst_val0;
941 v16i8 zero = { 0 };
942 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
944
945 angle = intra_pred_angle_up[mode - 18];
946 inv_angle_val = inv_angle[mode - 18];
947 last = (angle) >> 3;
948 angle_loop = angle;
949
950 ref = src_top - 1;
951 if (angle < 0 && last < -1) {
952 inv_angle_val = inv_angle[mode - 18];
953
954 tmp0 = LD(ref);
955 SD(tmp0, ref_tmp);
956
957 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959 ref_tmp[h_cnt] = src_left[offset];
960 }
961
962 ref = ref_tmp;
963 }
964
965 idx0 = angle_loop >> 5;
966 fact_val0 = angle_loop & 31;
967 angle_loop += angle;
968
969 idx1 = angle_loop >> 5;
970 fact_val1 = angle_loop & 31;
971 angle_loop += angle;
972
973 idx2 = angle_loop >> 5;
974 fact_val2 = angle_loop & 31;
975 angle_loop += angle;
976
977 idx3 = angle_loop >> 5;
978 fact_val3 = angle_loop & 31;
979
980 top0 = LD_SB(ref + idx0 + 1);
981 top1 = LD_SB(ref + idx1 + 1);
982 top2 = LD_SB(ref + idx2 + 1);
983 top3 = LD_SB(ref + idx3 + 1);
984
985 fact0 = __msa_fill_h(fact_val0);
986 fact1 = __msa_fill_h(32 - fact_val0);
987
988 fact2 = __msa_fill_h(fact_val1);
989 fact3 = __msa_fill_h(32 - fact_val1);
990
991 fact4 = __msa_fill_h(fact_val2);
992 fact5 = __msa_fill_h(32 - fact_val2);
993
994 fact6 = __msa_fill_h(fact_val3);
995 fact7 = __msa_fill_h(32 - fact_val3);
996
997 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
999 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1000 diff0, diff2, diff4, diff6);
1001 SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1002 diff1, diff3, diff5, diff7);
1003 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1004 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1005 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1006
1007 diff1 += diff0 * fact1;
1008 diff3 += diff2 * fact3;
1009
1010 SRARI_H2_SH(diff1, diff3, 5);
1011 dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1012 ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
1013 }
1014
1015 static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
1016 const uint8_t *src_left,
1017 uint8_t *dst,
1018 int32_t stride,
1019 int32_t mode)
1020 {
1021 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1022 uint8_t ref_array[3 * 32 + 4];
1023 uint8_t *ref_tmp = ref_array + 8;
1024 const uint8_t *ref;
1025 const uint8_t *src_left_tmp = src_left - 1;
1026 int32_t last, offset;
1027 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1028 int32_t idx2, fact_val2, idx3, fact_val3;
1029 int32_t angle, angle_loop;
1030 int32_t inv_angle_val, inv_angle_val_loop;
1031 int32_t tmp0, tmp1, tmp2;
1032 v16i8 top0, top1, top2, top3;
1033 v16u8 dst_val0, dst_val1;
1034 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1035 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1036
1037 angle = intra_pred_angle_up[mode - 18];
1038 inv_angle_val = inv_angle[mode - 18];
1039 last = (angle) >> 2;
1040 angle_loop = angle;
1041
1042 ref = src_top - 1;
1043 if (last < -1) {
1044 inv_angle_val_loop = inv_angle_val * last;
1045
1046 tmp0 = LW(ref);
1047 tmp1 = LW(ref + 4);
1048 tmp2 = LW(ref + 8);
1049 SW(tmp0, ref_tmp);
1050 SW(tmp1, ref_tmp + 4);
1051 SW(tmp2, ref_tmp + 8);
1052
1053 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1054 offset = (inv_angle_val_loop + 128) >> 8;
1055 ref_tmp[h_cnt] = src_left_tmp[offset];
1056 inv_angle_val_loop += inv_angle_val;
1057 }
1058 ref = ref_tmp;
1059 }
1060
1061 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1062 idx0 = (angle_loop) >> 5;
1063 fact_val0 = (angle_loop) & 31;
1064 angle_loop += angle;
1065
1066 idx1 = (angle_loop) >> 5;
1067 fact_val1 = (angle_loop) & 31;
1068 angle_loop += angle;
1069
1070 idx2 = (angle_loop) >> 5;
1071 fact_val2 = (angle_loop) & 31;
1072 angle_loop += angle;
1073
1074 idx3 = (angle_loop) >> 5;
1075 fact_val3 = (angle_loop) & 31;
1076 angle_loop += angle;
1077
1078 top0 = LD_SB(ref + idx0 + 1);
1079 top1 = LD_SB(ref + idx1 + 1);
1080 top2 = LD_SB(ref + idx2 + 1);
1081 top3 = LD_SB(ref + idx3 + 1);
1082
1083 fact0 = __msa_fill_h(fact_val0);
1084 fact1 = __msa_fill_h(32 - fact_val0);
1085 fact2 = __msa_fill_h(fact_val1);
1086 fact3 = __msa_fill_h(32 - fact_val1);
1087 fact4 = __msa_fill_h(fact_val2);
1088 fact5 = __msa_fill_h(32 - fact_val2);
1089 fact6 = __msa_fill_h(fact_val3);
1090 fact7 = __msa_fill_h(32 - fact_val3);
1091
1092 UNPCK_UB_SH(top0, diff0, diff1);
1093 UNPCK_UB_SH(top1, diff2, diff3);
1094 UNPCK_UB_SH(top2, diff4, diff5);
1095 UNPCK_UB_SH(top3, diff6, diff7);
1096
1097 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1098 diff1, diff3, diff5, diff7);
1099 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1100 diff1, diff3, diff5, diff7);
1101
1102 diff1 += diff0 * fact1;
1103 diff3 += diff2 * fact3;
1104 diff5 += diff4 * fact5;
1105 diff7 += diff6 * fact7;
1106
1107 SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1108 PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1109 ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
1110 dst += (4 * stride);
1111 }
1112 }
1113
1114 static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
1115 const uint8_t *src_left,
1116 uint8_t *dst,
1117 int32_t stride,
1118 int32_t mode)
1119 {
1120 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1121 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1122 int32_t idx2, fact_val2, idx3, fact_val3;
1123 int32_t tmp0;
1124 int32_t angle, angle_loop, offset;
1125 int32_t inv_angle_val, inv_angle_val_loop;
1126 uint8_t ref_array[3 * 32 + 4];
1127 uint8_t *ref_tmp = ref_array + 16;
1128 const uint8_t *ref;
1129 const uint8_t *src_left_tmp = src_left - 1;
1130 int32_t last;
1131 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1132 v16i8 dst0, dst1, dst2, dst3;
1133 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1134 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1135 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1136
1137 angle = intra_pred_angle_up[mode - 18];
1138 inv_angle_val = inv_angle[mode - 18];
1139 last = angle >> 1;
1140 angle_loop = angle;
1141
1142 ref = src_top - 1;
1143 if (last < -1) {
1144 inv_angle_val_loop = inv_angle_val * last;
1145
1146 top0 = LD_UB(ref);
1147 tmp0 = LW(ref + 16);
1148 ST_UB(top0, ref_tmp);
1149 SW(tmp0, ref_tmp + 16);
1150
1151 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1152 offset = (inv_angle_val_loop + 128) >> 8;
1153 ref_tmp[h_cnt] = src_left_tmp[offset];
1154 inv_angle_val_loop += inv_angle_val;
1155 }
1156 ref = ref_tmp;
1157 }
1158
1159 for (v_cnt = 4; v_cnt--;) {
1160 idx0 = (angle_loop) >> 5;
1161 fact_val0 = (angle_loop) & 31;
1162 angle_loop += angle;
1163
1164 idx1 = (angle_loop) >> 5;
1165 fact_val1 = (angle_loop) & 31;
1166 angle_loop += angle;
1167
1168 idx2 = (angle_loop) >> 5;
1169 fact_val2 = (angle_loop) & 31;
1170 angle_loop += angle;
1171
1172 idx3 = (angle_loop) >> 5;
1173 fact_val3 = (angle_loop) & 31;
1174 angle_loop += angle;
1175
1176 LD_UB2(ref + idx0 + 1, 16, top0, top1);
1177 LD_UB2(ref + idx1 + 1, 16, top2, top3);
1178 LD_UB2(ref + idx2 + 1, 16, top4, top5);
1179 LD_UB2(ref + idx3 + 1, 16, top6, top7);
1180
1181 fact0 = __msa_fill_h(fact_val0);
1182 fact1 = __msa_fill_h(32 - fact_val0);
1183 fact2 = __msa_fill_h(fact_val1);
1184 fact3 = __msa_fill_h(32 - fact_val1);
1185 fact4 = __msa_fill_h(fact_val2);
1186 fact5 = __msa_fill_h(32 - fact_val2);
1187 fact6 = __msa_fill_h(fact_val3);
1188 fact7 = __msa_fill_h(32 - fact_val3);
1189
1190 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1191 top1, top3, top5, top7);
1192 UNPCK_UB_SH(top0, diff0, diff1);
1193 UNPCK_UB_SH(top1, diff2, diff3);
1194 UNPCK_UB_SH(top2, diff4, diff5);
1195 UNPCK_UB_SH(top3, diff6, diff7);
1196 UNPCK_UB_SH(top4, diff8, diff9);
1197 UNPCK_UB_SH(top5, diff10, diff11);
1198 UNPCK_UB_SH(top6, diff12, diff13);
1199 UNPCK_UB_SH(top7, diff14, diff15);
1200
1201 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1202 diff2, diff3, diff6, diff7);
1203 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1204 diff10, diff11, diff14, diff15);
1205
1206 diff2 += diff0 * fact1;
1207 diff3 += diff1 * fact1;
1208 diff6 += diff4 * fact3;
1209 diff7 += diff5 * fact3;
1210 diff10 += diff8 * fact5;
1211 diff11 += diff9 * fact5;
1212 diff14 += diff12 * fact7;
1213 diff15 += diff13 * fact7;
1214
1215 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1216 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1217 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1218 dst0, dst1, dst2, dst3);
1219 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1220 dst += (4 * stride);
1221 }
1222 }
1223
1224 static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
1225 const uint8_t *src_left,
1226 uint8_t *dst,
1227 int32_t stride,
1228 int32_t mode)
1229 {
1230 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1231 uint8_t ref_array[3 * 32 + 4];
1232 uint8_t *ref_tmp;
1233 const uint8_t *ref;
1234 const uint8_t *src_left_tmp = src_left - 1;
1235 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1236 int32_t tmp0, tmp1, tmp2, tmp3;
1237 int32_t angle, angle_loop;
1238 int32_t inv_angle_val, inv_angle_val_loop;
1239 int32_t last, offset;
1240 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1241 v16i8 dst0, dst1, dst2, dst3;
1242 v8i16 fact0, fact1, fact2, fact3;
1243 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1244 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1245
1246 ref_tmp = ref_array + 32;
1247
1248 angle = intra_pred_angle_up[mode - 18];
1249 inv_angle_val = inv_angle[mode - 18];
1250 last = angle;
1251 angle_loop = angle;
1252
1253 ref = src_top - 1;
1254 if (last < -1) {
1255 inv_angle_val_loop = inv_angle_val * last;
1256 LD_UB2(ref, 16, top0, top1);
1257 tmp0 = ref[32];
1258 tmp1 = ref[33];
1259 tmp2 = ref[34];
1260 tmp3 = ref[35];
1261
1262 ST_UB2(top0, top1, ref_tmp, 16);
1263 ref_tmp[32] = tmp0;
1264 ref_tmp[33] = tmp1;
1265 ref_tmp[34] = tmp2;
1266 ref_tmp[35] = tmp3;
1267
1268 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1269 offset = (inv_angle_val_loop + 128) >> 8;
1270 ref_tmp[h_cnt] = src_left_tmp[offset];
1271 inv_angle_val_loop += inv_angle_val;
1272 }
1273
1274 ref = ref_tmp;
1275 }
1276
1277 for (v_cnt = 16; v_cnt--;) {
1278 idx0 = (angle_loop) >> 5;
1279 fact_val0 = (angle_loop) & 31;
1280 angle_loop += angle;
1281
1282 idx1 = (angle_loop) >> 5;
1283 fact_val1 = (angle_loop) & 31;
1284 angle_loop += angle;
1285
1286 top0 = LD_UB(ref + idx0 + 1);
1287 top4 = LD_UB(ref + idx1 + 1);
1288 top1 = LD_UB(ref + idx0 + 17);
1289 top5 = LD_UB(ref + idx1 + 17);
1290 top3 = LD_UB(ref + idx0 + 33);
1291 top7 = LD_UB(ref + idx1 + 33);
1292
1293 fact0 = __msa_fill_h(fact_val0);
1294 fact1 = __msa_fill_h(32 - fact_val0);
1295 fact2 = __msa_fill_h(fact_val1);
1296 fact3 = __msa_fill_h(32 - fact_val1);
1297
1298 top2 = top1;
1299 top6 = top5;
1300
1301 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1302 top1, top3, top5, top7);
1303 UNPCK_UB_SH(top0, diff0, diff1);
1304 UNPCK_UB_SH(top1, diff2, diff3);
1305 UNPCK_UB_SH(top2, diff4, diff5);
1306 UNPCK_UB_SH(top3, diff6, diff7);
1307 UNPCK_UB_SH(top4, diff8, diff9);
1308 UNPCK_UB_SH(top5, diff10, diff11);
1309 UNPCK_UB_SH(top6, diff12, diff13);
1310 UNPCK_UB_SH(top7, diff14, diff15);
1311
1312 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1313 diff2, diff3, diff6, diff7);
1314 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1315 diff10, diff11, diff14, diff15);
1316
1317 diff2 += diff0 * fact1;
1318 diff3 += diff1 * fact1;
1319 diff6 += diff4 * fact1;
1320 diff7 += diff5 * fact1;
1321 diff10 += diff8 * fact3;
1322 diff11 += diff9 * fact3;
1323 diff14 += diff12 * fact3;
1324 diff15 += diff13 * fact3;
1325
1326 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1327 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1328 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1329 dst0, dst1, dst2, dst3);
1330
1331 ST_SB2(dst0, dst1, dst, 16);
1332 dst += stride;
1333 ST_SB2(dst2, dst3, dst, 16);
1334 dst += stride;
1335 }
1336 }
1337
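/*
 * Angular prediction for the mostly horizontal modes 2..17 ("lower"). The
 * same interpolation is performed along the left reference (projecting top
 * samples for negative angles, with inv_angle indexed by mode - 11), and the
 * column-wise result is transposed into the destination block.
 */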
1338 static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
1339 const uint8_t *src_left,
1340 uint8_t *dst,
1341 int32_t stride,
1342 int32_t mode)
1343 {
1344 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1345 uint8_t ref_array[3 * 32 + 4];
1346 uint8_t *ref_tmp = ref_array + 4;
1347 const uint8_t *ref;
1348 int32_t last, offset;
1349 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1350 int32_t idx2, fact_val2, idx3, fact_val3;
1351 int32_t angle, angle_loop, inv_angle_val;
1352 uint64_t tmp0;
1353 v16i8 dst_val0, dst_val1;
1354 v16u8 top0, top1, top2, top3;
1355 v16u8 zero = { 0 };
1356 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1357 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1358
1359 angle = intra_pred_angle_low[mode - 2];
1360 last = angle >> 3;
1361 angle_loop = angle;
1362
1363 ref = src_left - 1;
1364 if (last < -1) {
1365 inv_angle_val = inv_angle[mode - 11];
1366
1367 tmp0 = LD(ref);
1368 SD(tmp0, ref_tmp);
1369
1370 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1371 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1372 ref_tmp[h_cnt] = src_top[offset];
1373 }
1374
1375 ref = ref_tmp;
1376 }
1377
1378 idx0 = angle_loop >> 5;
1379 fact_val0 = angle_loop & 31;
1380 angle_loop += angle;
1381
1382 idx1 = angle_loop >> 5;
1383 fact_val1 = angle_loop & 31;
1384 angle_loop += angle;
1385
1386 idx2 = angle_loop >> 5;
1387 fact_val2 = angle_loop & 31;
1388 angle_loop += angle;
1389
1390 idx3 = angle_loop >> 5;
1391 fact_val3 = angle_loop & 31;
1392
1393 top0 = LD_UB(ref + idx0 + 1);
1394 top1 = LD_UB(ref + idx1 + 1);
1395 top2 = LD_UB(ref + idx2 + 1);
1396 top3 = LD_UB(ref + idx3 + 1);
1397
1398 fact0 = __msa_fill_h(fact_val0);
1399 fact1 = __msa_fill_h(32 - fact_val0);
1400 fact2 = __msa_fill_h(fact_val1);
1401 fact3 = __msa_fill_h(32 - fact_val1);
1402 fact4 = __msa_fill_h(fact_val2);
1403 fact5 = __msa_fill_h(32 - fact_val2);
1404 fact6 = __msa_fill_h(fact_val3);
1405 fact7 = __msa_fill_h(32 - fact_val3);
1406
1407 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1408 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1409 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1410 diff0, diff2, diff4, diff6);
1411 SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1412 diff1, diff3, diff5, diff7);
1413 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1414 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1415 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1416
1417 diff1 += diff0 * fact1;
1418 diff3 += diff2 * fact3;
1419
1420 SRARI_H2_SH(diff1, diff3, 5);
1421 PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1422
1423 diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1424 diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1425
1426 diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1427
1428 dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1429 dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1430
1431 ST_W2(dst_val0, 0, 1, dst, stride);
1432 ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
1433 }
1434
1435 static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
1436 const uint8_t *src_left,
1437 uint8_t *dst,
1438 int32_t stride,
1439 int32_t mode)
1440 {
1441 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1442 uint8_t ref_array[3 * 32 + 4];
1443 uint8_t *ref_tmp = ref_array + 8;
1444 const uint8_t *ref;
1445 const uint8_t *src_top_tmp = src_top - 1;
1446 uint8_t *dst_org;
1447 int32_t last, offset, tmp0, tmp1, tmp2;
1448 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1449 int32_t idx2, fact_val2, idx3, fact_val3;
1450 int32_t angle, angle_loop, inv_angle_val;
1451 v16i8 top0, top1, top2, top3;
1452 v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1453 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1454 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1455
1456 angle = intra_pred_angle_low[mode - 2];
1457 last = (angle) >> 2;
1458 angle_loop = angle;
1459
1460 ref = src_left - 1;
1461 if (last < -1) {
1462 inv_angle_val = inv_angle[mode - 11];
1463
1464 tmp0 = LW(ref);
1465 tmp1 = LW(ref + 4);
1466 tmp2 = LW(ref + 8);
1467 SW(tmp0, ref_tmp);
1468 SW(tmp1, ref_tmp + 4);
1469 SW(tmp2, ref_tmp + 8);
1470
1471 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1472 offset = (h_cnt * inv_angle_val + 128) >> 8;
1473 ref_tmp[h_cnt] = src_top_tmp[offset];
1474 }
1475
1476 ref = ref_tmp;
1477 }
1478
1479 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1480 dst_org = dst;
1481
1482 idx0 = angle_loop >> 5;
1483 fact_val0 = angle_loop & 31;
1484 angle_loop += angle;
1485
1486 idx1 = angle_loop >> 5;
1487 fact_val1 = angle_loop & 31;
1488 angle_loop += angle;
1489
1490 idx2 = angle_loop >> 5;
1491 fact_val2 = angle_loop & 31;
1492 angle_loop += angle;
1493
1494 idx3 = angle_loop >> 5;
1495 fact_val3 = angle_loop & 31;
1496 angle_loop += angle;
1497
1498 top0 = LD_SB(ref + idx0 + 1);
1499 top1 = LD_SB(ref + idx1 + 1);
1500 top2 = LD_SB(ref + idx2 + 1);
1501 top3 = LD_SB(ref + idx3 + 1);
1502
1503 fact0 = __msa_fill_h(fact_val0);
1504 fact1 = __msa_fill_h(32 - fact_val0);
1505 fact2 = __msa_fill_h(fact_val1);
1506 fact3 = __msa_fill_h(32 - fact_val1);
1507 fact4 = __msa_fill_h(fact_val2);
1508 fact5 = __msa_fill_h(32 - fact_val2);
1509 fact6 = __msa_fill_h(fact_val3);
1510 fact7 = __msa_fill_h(32 - fact_val3);
1511
1512 UNPCK_UB_SH(top0, diff0, diff1);
1513 UNPCK_UB_SH(top1, diff2, diff3);
1514 UNPCK_UB_SH(top2, diff4, diff5);
1515 UNPCK_UB_SH(top3, diff6, diff7);
1516 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1517 diff1, diff3, diff5, diff7);
1518 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1519 diff1, diff3, diff5, diff7);
1520
1521 diff1 += diff0 * fact1;
1522 diff3 += diff2 * fact3;
1523 diff5 += diff4 * fact5;
1524 diff7 += diff6 * fact7;
1525
1526 SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1527 PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1528 dst_val0, dst_val1, dst_val2, dst_val3);
1529 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1530 ILVRL_H2_SH(diff1, diff0, diff3, diff4);
1531 ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1532 dst += 4;
1533 }
1534 }
1535
1536 static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1537 const uint8_t *src_left,
1538 uint8_t *dst,
1539 int32_t stride,
1540 int32_t mode)
1541 {
1542 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544 int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1550 int32_t angle, angle_loop, inv_angle_val, offset;
1551 uint8_t ref_array[3 * 32 + 4];
1552 uint8_t *ref_tmp = ref_array + 16;
1553 const uint8_t *ref, *src_top_tmp = src_top - 1;
1554 uint8_t *dst_org;
1555 int32_t last;
1556
1557 angle = intra_pred_angle_low[mode - 2];
1558 last = (angle) >> 1;
1559 angle_loop = angle;
1560
1561 ref = src_left - 1;
1562 if (last < -1) {
1563 inv_angle_val = inv_angle[mode - 11];
1564
1565 top0 = LD_SB(ref);
1566 tmp0 = LW(ref + 16);
1567 ST_SB(top0, ref_tmp);
1568 SW(tmp0, ref_tmp + 16);
1569
1570 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571 offset = (h_cnt * inv_angle_val + 128) >> 8;
1572 ref_tmp[h_cnt] = src_top_tmp[offset];
1573 }
1574
1575 ref = ref_tmp;
1576 }
1577
1578 for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579 dst_org = dst;
1580
1581 idx0 = angle_loop >> 5;
1582 fact_val0 = angle_loop & 31;
1583 angle_loop += angle;
1584
1585 idx1 = angle_loop >> 5;
1586 fact_val1 = angle_loop & 31;
1587 angle_loop += angle;
1588
1589 idx2 = angle_loop >> 5;
1590 fact_val2 = angle_loop & 31;
1591 angle_loop += angle;
1592
1593 idx3 = angle_loop >> 5;
1594 fact_val3 = angle_loop & 31;
1595 angle_loop += angle;
1596
1597 LD_SB2(ref + idx0 + 1, 16, top0, top1);
1598 LD_SB2(ref + idx1 + 1, 16, top2, top3);
1599 LD_SB2(ref + idx2 + 1, 16, top4, top5);
1600 LD_SB2(ref + idx3 + 1, 16, top6, top7);
1601
1602 fact0 = __msa_fill_h(fact_val0);
1603 fact1 = __msa_fill_h(32 - fact_val0);
1604 fact2 = __msa_fill_h(fact_val1);
1605 fact3 = __msa_fill_h(32 - fact_val1);
1606 fact4 = __msa_fill_h(fact_val2);
1607 fact5 = __msa_fill_h(32 - fact_val2);
1608 fact6 = __msa_fill_h(fact_val3);
1609 fact7 = __msa_fill_h(32 - fact_val3);
1610
1611 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612 top1, top3, top5, top7);
1613
1614 UNPCK_UB_SH(top0, diff0, diff1);
1615 UNPCK_UB_SH(top1, diff2, diff3);
1616 UNPCK_UB_SH(top2, diff4, diff5);
1617 UNPCK_UB_SH(top3, diff6, diff7);
1618 UNPCK_UB_SH(top4, diff8, diff9);
1619 UNPCK_UB_SH(top5, diff10, diff11);
1620 UNPCK_UB_SH(top6, diff12, diff13);
1621 UNPCK_UB_SH(top7, diff14, diff15);
1622
1623 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624 diff2, diff3, diff6, diff7);
1625 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626 diff10, diff11, diff14, diff15);
1627
1628 diff2 += diff0 * fact1;
1629 diff3 += diff1 * fact1;
1630 diff6 += diff4 * fact3;
1631 diff7 += diff5 * fact3;
1632 diff10 += diff8 * fact5;
1633 diff11 += diff9 * fact5;
1634 diff14 += diff12 * fact7;
1635 diff15 += diff13 * fact7;
1636
1637 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1638 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1639 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640 dst_val0, dst_val1, dst_val2, dst_val3);
1641 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642 ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1643 ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1644 ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1645 ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1646 dst_org += (8 * stride);
1647 ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1648 dst += 4;
1649 }
1650 }
1651
1652 static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
1653 const uint8_t *src_left,
1654 uint8_t *dst,
1655 int32_t stride,
1656 int32_t mode)
1657 {
1658 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1659 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1660 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1661 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1662 v8i16 fact0, fact1, fact2, fact3;
1663 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1664 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1665 int32_t angle, angle_loop, inv_angle_val, offset;
1666 uint8_t ref_array[3 * 32 + 4];
1667 uint8_t *ref_tmp = ref_array + 32;
1668 const uint8_t *ref, *src_top_tmp = src_top - 1;
1669 uint8_t *dst_org;
1670 int32_t last;
1671
1672 angle = intra_pred_angle_low[mode - 2];
1673 last = angle;
1674 angle_loop = angle;
1675
1676 ref = src_left - 1;
1677 if (last < -1) {
1678 inv_angle_val = inv_angle[mode - 11];
1679
1680 LD_SB2(ref, 16, top0, top1);
1681 tmp0 = LW(ref + 32);
1682 ST_SB2(top0, top1, ref_tmp, 16);
1683 SW(tmp0, ref_tmp + 32);
1684
1685 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1686 offset = (h_cnt * inv_angle_val + 128) >> 8;
1687 ref_tmp[h_cnt] = src_top_tmp[offset];
1688 }
1689
1690 ref = ref_tmp;
1691 }
1692
1693 for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1694 dst_org = dst;
1695 idx0 = angle_loop >> 5;
1696 fact_val0 = angle_loop & 31;
1697 angle_loop += angle;
1698
1699 idx1 = angle_loop >> 5;
1700 fact_val1 = angle_loop & 31;
1701 angle_loop += angle;
1702
1703 top0 = LD_SB(ref + idx0 + 1);
1704 top4 = LD_SB(ref + idx1 + 1);
1705 top1 = LD_SB(ref + idx0 + 17);
1706 top5 = LD_SB(ref + idx1 + 17);
1707 top3 = LD_SB(ref + idx0 + 33);
1708 top7 = LD_SB(ref + idx1 + 33);
1709
1710 fact0 = __msa_fill_h(fact_val0);
1711 fact1 = __msa_fill_h(32 - fact_val0);
1712 fact2 = __msa_fill_h(fact_val1);
1713 fact3 = __msa_fill_h(32 - fact_val1);
1714
1715 top2 = top1;
1716 top6 = top5;
1717
1718 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1719 top1, top3, top5, top7);
1720
1721 UNPCK_UB_SH(top0, diff0, diff1);
1722 UNPCK_UB_SH(top1, diff2, diff3);
1723 UNPCK_UB_SH(top2, diff4, diff5);
1724 UNPCK_UB_SH(top3, diff6, diff7);
1725 UNPCK_UB_SH(top4, diff8, diff9);
1726 UNPCK_UB_SH(top5, diff10, diff11);
1727 UNPCK_UB_SH(top6, diff12, diff13);
1728 UNPCK_UB_SH(top7, diff14, diff15);
1729
1730 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1731 diff2, diff3, diff6, diff7);
1732 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1733 diff10, diff11, diff14, diff15);
1734
1735 diff2 += diff0 * fact1;
1736 diff3 += diff1 * fact1;
1737 diff6 += diff4 * fact1;
1738 diff7 += diff5 * fact1;
1739 diff10 += diff8 * fact3;
1740 diff11 += diff9 * fact3;
1741 diff14 += diff12 * fact3;
1742 diff15 += diff13 * fact3;
1743
1744 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1745 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1746 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1747 dst_val0, dst_val1, dst_val2, dst_val3);
1748 ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
1749 ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
1750
1751 ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1752 dst_org += (8 * stride);
1753 ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1754 dst_org += (8 * stride);
1755 ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1756 dst_org += (8 * stride);
1757 ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1758 dst_org += (8 * stride);
1759
1760 dst += 2;
1761 }
1762 }
1763
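/* Pure vertical prediction for 32x32: the 32 top neighbours are simply
 * replicated into every row (no boundary filtering is done at this size). */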
1764 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
1765 int32_t dst_stride)
1766 {
1767 uint32_t row;
1768 v16u8 src1, src2;
1769
1770 src1 = LD_UB(src);
1771 src2 = LD_UB(src + 16);
1772
1773 for (row = 32; row--;) {
1774 ST_UB2(src1, src2, dst, 16);
1775 dst += dst_stride;
1776 }
1777 }
1778
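/* Planar prediction entry points; the index into the pred_planar[] function
 * table selects the block size, 0..3 mapping to 4x4 .. 32x32. */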
1779 void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
1780 const uint8_t *src_top,
1781 const uint8_t *src_left,
1782 ptrdiff_t stride)
1783 {
1784 hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
1785 }
1786
1787 void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
1788 const uint8_t *src_top,
1789 const uint8_t *src_left,
1790 ptrdiff_t stride)
1791 {
1792 hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
1793 }
1794
1795 void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
1796 const uint8_t *src_top,
1797 const uint8_t *src_left,
1798 ptrdiff_t stride)
1799 {
1800 hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
1801 }
1802
1803 void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
1804 const uint8_t *src_top,
1805 const uint8_t *src_left,
1806 ptrdiff_t stride)
1807 {
1808 hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
1809 }
1810
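/* DC prediction entry point: the log2 block size selects the size-specific
 * implementation (4x4 .. 32x32); c_idx is only needed by the sizes that may
 * apply the luma boundary filter. */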
1811 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1812 const uint8_t *src_left,
1813 ptrdiff_t stride, int log2, int c_idx)
1814 {
1815 switch (log2) {
1816 case 2:
1817 hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1818 break;
1819
1820 case 3:
1821 hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1822 break;
1823
1824 case 4:
1825 hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1826 break;
1827
1828 case 5:
1829 hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1830 break;
1831 }
1832 }
1833
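/* Angular prediction entry points (index 0..3 = 4x4 .. 32x32).  Pure
 * horizontal (mode 10) and vertical (mode 26) use dedicated routines; for the
 * sizes below 32x32, c_idx doubles as the flag that restricts their boundary
 * sample filter to luma.  Modes >= 18 take the "upper" (top reference) path,
 * the remaining modes the "lower" (left reference) path. */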
1834 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1835 const uint8_t *src_top,
1836 const uint8_t *src_left,
1837 ptrdiff_t stride, int c_idx, int mode)
1838 {
1839 if (mode == 10) {
1840 hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841 } else if (mode == 26) {
1842 hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1843 } else if (mode >= 18) {
1844 hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1845 dst, stride, mode);
1846 } else {
1847 hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1848 dst, stride, mode);
1849 }
1850 }
1851
1852 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1853 const uint8_t *src_top,
1854 const uint8_t *src_left,
1855 ptrdiff_t stride, int c_idx, int mode)
1856 {
1857 if (mode == 10) {
1858 hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859 } else if (mode == 26) {
1860 hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1861 } else if (mode >= 18) {
1862 hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1863 dst, stride, mode);
1864 } else {
1865 hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1866 dst, stride, mode);
1867 }
1868 }
1869
1870 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1871 const uint8_t *src_top,
1872 const uint8_t *src_left,
1873 ptrdiff_t stride, int c_idx, int mode)
1874 {
1875 if (mode == 10) {
1876 hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877 } else if (mode == 26) {
1878 hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1879 } else if (mode >= 18) {
1880 hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1881 dst, stride, mode);
1882 } else {
1883 hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1884 dst, stride, mode);
1885 }
1886 }
1887
1888 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1889 const uint8_t *src_top,
1890 const uint8_t *src_left,
1891 ptrdiff_t stride, int c_idx, int mode)
1892 {
1893 if (mode == 10) {
1894 hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1895 } else if (mode == 26) {
1896 intra_predict_vert_32x32_msa(src_top, dst, stride);
1897 } else if (mode >= 18) {
1898 hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1899 dst, stride, mode);
1900 } else {
1901 hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1902 dst, stride, mode);
1903 }
1904 }
1905
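/* Complete intra prediction of a 16x16 block at (x0, y0) for plane c_idx:
 * collect the neighbouring reference samples, substitute the unavailable ones
 * (honouring constrained intra prediction), optionally smooth them, then run
 * the planar/DC/angular predictor selected by the mode. */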
1906 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1907 {
1908 v16u8 vec0;
1909 HEVCLocalContext *lc = s->HEVClc;
1910 int i;
1911 int hshift = s->ps.sps->hshift[c_idx];
1912 int vshift = s->ps.sps->vshift[c_idx];
1913 int size_in_luma_h = 16 << hshift;
1914 int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1915 int size_in_luma_v = 16 << vshift;
1916 int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1917 int x = x0 >> hshift;
1918 int y = y0 >> vshift;
1919 int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1920 int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1921
1922 int cur_tb_addr =
1923 s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1924
1925 ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1926 uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1927
1928 int min_pu_width = s->ps.sps->min_pu_width;
1929
1930 enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1931 lc->tu.intra_pred_mode;
1932 uint32_t a;
1933 uint8_t left_array[2 * 32 + 1];
1934 uint8_t filtered_left_array[2 * 32 + 1];
1935 uint8_t top_array[2 * 32 + 1];
1936 uint8_t filtered_top_array[2 * 32 + 1];
1937
1938 uint8_t *left = left_array + 1;
1939 uint8_t *top = top_array + 1;
1940 uint8_t *filtered_left = filtered_left_array + 1;
1941 uint8_t *filtered_top = filtered_top_array + 1;
1942 int cand_bottom_left = lc->na.cand_bottom_left
1943 && cur_tb_addr >
1944 s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1945 (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1946 int cand_left = lc->na.cand_left;
1947 int cand_up_left = lc->na.cand_up_left;
1948 int cand_up = lc->na.cand_up;
1949 int cand_up_right = lc->na.cand_up_right
1950 && cur_tb_addr >
1951 s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1952 ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1953
1954 int bottom_left_size =
1955 (((y0 + 2 * size_in_luma_v) >
1956 (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1957 2 * size_in_luma_v)) -
1958 (y0 + size_in_luma_v)) >> vshift;
1959 int top_right_size =
1960 (((x0 + 2 * size_in_luma_h) >
1961 (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1962 (x0 + size_in_luma_h)) >> hshift;
1963
1964 if (s->ps.pps->constrained_intra_pred_flag == 1) {
1965 int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1966 int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1967 int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1968 int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1969 if (!size_in_luma_pu_h)
1970 size_in_luma_pu_h++;
1971 if (cand_bottom_left == 1 && on_pu_edge_x) {
1972 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1973 int y_bottom_pu =
1974 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1975 int max =
1976 ((size_in_luma_pu_v) >
1977 (s->ps.sps->min_pu_height -
1978 y_bottom_pu) ? (s->ps.sps->min_pu_height -
1979 y_bottom_pu) : (size_in_luma_pu_v));
1980 cand_bottom_left = 0;
1981 for (i = 0; i < max; i += 2)
1982 cand_bottom_left |=
1983 ((s->ref->tab_mvf[(x_left_pu) +
1984 (y_bottom_pu +
1985 i) * min_pu_width]).pred_flag ==
1986 PF_INTRA);
1987 }
1988 if (cand_left == 1 && on_pu_edge_x) {
1989 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1990 int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1991 int max =
1992 ((size_in_luma_pu_v) >
1993 (s->ps.sps->min_pu_height -
1994 y_left_pu) ? (s->ps.sps->min_pu_height -
1995 y_left_pu) : (size_in_luma_pu_v));
1996 cand_left = 0;
1997 for (i = 0; i < max; i += 2)
1998 cand_left |=
1999 ((s->ref->tab_mvf[(x_left_pu) +
2000 (y_left_pu +
2001 i) * min_pu_width]).pred_flag ==
2002 PF_INTRA);
2003 }
2004 if (cand_up_left == 1) {
2005 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2006 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2007 cand_up_left =
2008 (s->ref->tab_mvf[(x_left_pu) +
2009 (y_top_pu) * min_pu_width]).pred_flag ==
2010 PF_INTRA;
2011 }
2012 if (cand_up == 1 && on_pu_edge_y) {
2013 int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2014 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2015 int max =
2016 ((size_in_luma_pu_h) >
2017 (s->ps.sps->min_pu_width -
2018 x_top_pu) ? (s->ps.sps->min_pu_width -
2019 x_top_pu) : (size_in_luma_pu_h));
2020 cand_up = 0;
2021 for (i = 0; i < max; i += 2)
2022 cand_up |=
2023 ((s->ref->tab_mvf[(x_top_pu + i) +
2024 (y_top_pu) *
2025 min_pu_width]).pred_flag == PF_INTRA);
2026 }
2027 if (cand_up_right == 1 && on_pu_edge_y) {
2028 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2029 int x_right_pu =
2030 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2031 int max =
2032 ((size_in_luma_pu_h) >
2033 (s->ps.sps->min_pu_width -
2034 x_right_pu) ? (s->ps.sps->min_pu_width -
2035 x_right_pu) : (size_in_luma_pu_h));
2036 cand_up_right = 0;
2037 for (i = 0; i < max; i += 2)
2038 cand_up_right |=
2039 ((s->ref->tab_mvf[(x_right_pu + i) +
2040 (y_top_pu) *
2041 min_pu_width]).pred_flag == PF_INTRA);
2042 }
2043
2044 vec0 = (v16u8) __msa_ldi_b(128);
2045
2046 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2047
2048 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2049
2050 top[-1] = 128;
2051 }
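    /* Copy whatever neighbouring samples are actually available into the
     * top[] and left[] reference arrays. */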
2052 if (cand_up_left) {
2053 left[-1] = src[(-1) + stride * (-1)];
2054 top[-1] = left[-1];
2055 }
2056 if (cand_up) {
2057 vec0 = LD_UB(src - stride);
2058 ST_UB(vec0, top);
2059 }
2060 if (cand_up_right) {
2061 vec0 = LD_UB(src - stride + 16);
2062 ST_UB(vec0, (top + 16));
2063
2064 do {
2065 uint32_t pix =
2066 ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2067 0x01010101U);
2068 for (i = 0; i < (16 - top_right_size); i += 4)
2069 ((((union unaligned_32 *) (top + 16 + top_right_size +
2070 i))->l) = (pix));
2071 } while (0);
2072 }
2073 if (cand_left)
2074 for (i = 0; i < 16; i++)
2075 left[i] = src[(-1) + stride * (i)];
2076 if (cand_bottom_left) {
2077 for (i = 16; i < 16 + bottom_left_size; i++)
2078 left[i] = src[(-1) + stride * (i)];
2079 do {
2080 uint32_t pix =
2081 ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2082 0x01010101U);
2083 for (i = 0; i < (16 - bottom_left_size); i += 4)
2084 ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2085 i))->l) = (pix));
2086 } while (0);
2087 }
2088
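    /* Constrained intra prediction: reference samples that come from
     * non-intra coded PUs are replaced by propagating the nearest
     * intra-coded neighbour along the reference row/column. */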
2089 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2090 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2091 || cand_up_right) {
2092 int size_max_x =
2093 x0 + ((2 * 16) << hshift) <
2094 s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2095 int size_max_y =
2096 y0 + ((2 * 16) << vshift) <
2097 s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2098 int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2099 if (!cand_up_right) {
2100 size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2101 16 : (s->ps.sps->width - x0) >> hshift;
2102 }
2103 if (!cand_bottom_left) {
2104 size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2105 16 : (s->ps.sps->height - y0) >> vshift;
2106 }
2107 if (cand_bottom_left || cand_left || cand_up_left) {
2108 while (j > -1
2109 &&
2110 !((s->ref->tab_mvf[(((x0 +
2111 ((-1) << hshift)) >> s->ps.sps->
2112 log2_min_pu_size)) + (((y0 +
2113 ((j) <<
2114 vshift))
2115 >> s->ps.sps->
2116 log2_min_pu_size))
2117 * min_pu_width]).pred_flag ==
2118 PF_INTRA))
2119 j--;
2120 if (!
2121 ((s->ref->tab_mvf[(((x0 +
2122 ((-1) << hshift)) >> s->ps.sps->
2123 log2_min_pu_size)) + (((y0 + ((j)
2124 <<
2125 vshift))
2126 >> s->ps.sps->
2127 log2_min_pu_size))
2128 * min_pu_width]).pred_flag == PF_INTRA)) {
2129 j = 0;
2130 while (j < size_max_x
2131 &&
2132 !((s->ref->tab_mvf[(((x0 +
2133 ((j) << hshift)) >> s->ps.sps->
2134 log2_min_pu_size)) + (((y0 +
2135 ((-1) <<
2136 vshift))
2137 >> s->
2138 ps.sps->
2139 log2_min_pu_size))
2140 * min_pu_width]).pred_flag ==
2141 PF_INTRA))
2142 j++;
2143 for (i = j; i > (j) - (j + 1); i--)
2144 if (!
2145 ((s->ref->tab_mvf[(((x0 +
2146 ((i -
2147 1) << hshift)) >> s->ps.sps->
2148 log2_min_pu_size)) + (((y0 +
2149 ((-1) <<
2150 vshift))
2151 >> s->
2152 ps.sps->
2153 log2_min_pu_size))
2154 * min_pu_width]).pred_flag ==
2155 PF_INTRA))
2156 top[i - 1] = top[i];
2157 left[-1] = top[-1];
2158 }
2159 } else {
2160 j = 0;
2161 while (j < size_max_x
2162 &&
2163 !((s->ref->tab_mvf[(((x0 +
2164 ((j) << hshift)) >> s->ps.sps->
2165 log2_min_pu_size)) + (((y0 + ((-1)
2166 <<
2167 vshift))
2168 >> s->ps.sps->
2169 log2_min_pu_size))
2170 * min_pu_width]).pred_flag ==
2171 PF_INTRA))
2172 j++;
2173 if (j > 0)
2174 if (x0 > 0) {
2175 for (i = j; i > (j) - (j + 1); i--)
2176 if (!
2177 ((s->ref->tab_mvf[(((x0 +
2178 ((i -
2179 1) << hshift)) >>
2180 s->ps.sps->log2_min_pu_size))
2181 + (((y0 + ((-1)
2182 << vshift))
2183 >>
2184 s->ps.sps->log2_min_pu_size))
2185 *
2186 min_pu_width]).pred_flag ==
2187 PF_INTRA))
2188 top[i - 1] = top[i];
2189 } else {
2190 for (i = j; i > (j) - (j); i--)
2191 if (!
2192 ((s->ref->tab_mvf[(((x0 +
2193 ((i -
2194 1) << hshift)) >>
2195 s->ps.sps->log2_min_pu_size))
2196 + (((y0 + ((-1)
2197 << vshift))
2198 >>
2199 s->ps.sps->log2_min_pu_size))
2200 *
2201 min_pu_width]).pred_flag ==
2202 PF_INTRA))
2203 top[i - 1] = top[i];
2204 top[-1] = top[0];
2205 }
2206 left[-1] = top[-1];
2207 }
2208 left[-1] = top[-1];
2209 if (cand_bottom_left || cand_left) {
2210 a = ((left[-1]) * 0x01010101U);
2211 for (i = 0; i < (0) + (size_max_y); i += 4)
2212 if (!
2213 ((s->ref->tab_mvf[(((x0 +
2214 ((-1) << hshift)) >> s->ps.sps->
2215 log2_min_pu_size)) + (((y0 +
2216 ((i) <<
2217 vshift))
2218 >> s->ps.sps->
2219 log2_min_pu_size))
2220 * min_pu_width]).pred_flag ==
2221 PF_INTRA))
2222 ((((union unaligned_32 *) (&left[i]))->l) = (a));
2223 else
2224 a = ((left[i + 3]) * 0x01010101U);
2225 }
2226 if (!cand_left) {
2227 vec0 = (v16u8) __msa_fill_b(left[-1]);
2228
2229 ST_UB(vec0, left);
2230 }
2231 if (!cand_bottom_left) {
2232
2233 vec0 = (v16u8) __msa_fill_b(left[15]);
2234
2235 ST_UB(vec0, (left + 16));
2236 }
2237 if (x0 != 0 && y0 != 0) {
2238 a = ((left[size_max_y - 1]) * 0x01010101U);
2239 for (i = (size_max_y - 1);
2240 i > (size_max_y - 1) - (size_max_y); i -= 4)
2241 if (!
2242 ((s->ref->tab_mvf[(((x0 +
2243 ((-1) << hshift)) >> s->ps.sps->
2244 log2_min_pu_size)) + (((y0 +
2245 ((i -
2246 3) <<
2247 vshift))
2248 >> s->ps.sps->
2249 log2_min_pu_size))
2250 * min_pu_width]).pred_flag ==
2251 PF_INTRA))
2252 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2253 else
2254 a = ((left[i - 3]) * 0x01010101U);
2255 if (!
2256 ((s->ref->tab_mvf[(((x0 +
2257 ((-1) << hshift)) >> s->ps.sps->
2258 log2_min_pu_size)) + (((y0 + ((-1)
2259 <<
2260 vshift))
2261 >> s->ps.sps->
2262 log2_min_pu_size))
2263 * min_pu_width]).pred_flag == PF_INTRA))
2264 left[-1] = left[0];
2265 } else if (x0 == 0) {
2266 do {
2267 uint32_t pix = ((0) * 0x01010101U);
2268 for (i = 0; i < (size_max_y); i += 4)
2269 ((((union unaligned_32 *) (left + i))->l) = (pix));
2270 } while (0);
2271 } else {
2272 a = ((left[size_max_y - 1]) * 0x01010101U);
2273 for (i = (size_max_y - 1);
2274 i > (size_max_y - 1) - (size_max_y); i -= 4)
2275 if (!
2276 ((s->ref->tab_mvf[(((x0 +
2277 ((-1) << hshift)) >> s->ps.sps->
2278 log2_min_pu_size)) + (((y0 +
2279 ((i -
2280 3) <<
2281 vshift))
2282 >> s->ps.sps->
2283 log2_min_pu_size))
2284 * min_pu_width]).pred_flag ==
2285 PF_INTRA))
2286 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2287 else
2288 a = ((left[i - 3]) * 0x01010101U);
2289 }
2290 top[-1] = left[-1];
2291 if (y0 != 0) {
2292 a = ((left[-1]) * 0x01010101U);
2293 for (i = 0; i < (0) + (size_max_x); i += 4)
2294 if (!
2295 ((s->ref->tab_mvf[(((x0 +
2296 ((i) << hshift)) >> s->ps.sps->
2297 log2_min_pu_size)) + (((y0 + ((-1)
2298 <<
2299 vshift))
2300 >> s->ps.sps->
2301 log2_min_pu_size))
2302 * min_pu_width]).pred_flag ==
2303 PF_INTRA))
2304 ((((union unaligned_32 *) (&top[i]))->l) = (a));
2305 else
2306 a = ((top[i + 3]) * 0x01010101U);
2307 }
2308 }
2309 }
2310
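    /* Fill any still-missing reference samples by replicating the nearest
     * available ones, so a full border of 2 * 16 + 1 samples exists on each
     * side. */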
2311 if (!cand_bottom_left) {
2312 if (cand_left) {
2313 vec0 = (v16u8) __msa_fill_b(left[15]);
2314
2315 ST_UB(vec0, (left + 16));
2316
2317 } else if (cand_up_left) {
2318 vec0 = (v16u8) __msa_fill_b(left[-1]);
2319
2320 ST_UB2(vec0, vec0, left, 16);
2321
2322 cand_left = 1;
2323 } else if (cand_up) {
2324 left[-1] = top[0];
2325
2326 vec0 = (v16u8) __msa_fill_b(left[-1]);
2327
2328 ST_UB2(vec0, vec0, left, 16);
2329
2330 cand_up_left = 1;
2331 cand_left = 1;
2332 } else if (cand_up_right) {
2333 vec0 = (v16u8) __msa_fill_b(top[16]);
2334
2335 ST_UB(vec0, top);
2336
2337 left[-1] = top[16];
2338
2339 ST_UB2(vec0, vec0, left, 16);
2340
2341 cand_up = 1;
2342 cand_up_left = 1;
2343 cand_left = 1;
2344 } else {
2345 left[-1] = 128;
2346 vec0 = (v16u8) __msa_ldi_b(128);
2347
2348 ST_UB2(vec0, vec0, top, 16);
2349 ST_UB2(vec0, vec0, left, 16);
2350 }
2351 }
2352
2353 if (!cand_left) {
2354 vec0 = (v16u8) __msa_fill_b(left[16]);
2355 ST_UB(vec0, left);
2356 }
2357 if (!cand_up_left) {
2358 left[-1] = left[0];
2359 }
2360 if (!cand_up) {
2361 vec0 = (v16u8) __msa_fill_b(left[-1]);
2362 ST_UB(vec0, top);
2363 }
2364 if (!cand_up_right) {
2365 vec0 = (v16u8) __msa_fill_b(top[15]);
2366 ST_UB(vec0, (top + 16));
2367 }
2368
2369 top[-1] = left[-1];
2370
2371
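    /* Reference smoothing: for luma (and 4:4:4 chroma), modes far enough from
     * horizontal/vertical run the references through the (1,2,1)/4 filter
     * before prediction. */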
2372 if (!s->ps.sps->intra_smoothing_disabled_flag
2373 && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2374 if (mode != INTRA_DC && 16 != 4) {
2375 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376 int min_dist_vert_hor =
2377 (((((int) (mode - 26U)) >=
2378 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2379 ((((int) (mode - 10U)) >=
2380 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381 ? ((((int) (mode - 10U)) >=
2382 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2383 : ((((int) (mode - 26U)) >=
2384 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2385 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388 for (i = 2 * 16 - 2; i >= 0; i--)
2389 filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390 left[i - 1] + 2) >> 2;
2391 filtered_top[-1] =
2392 filtered_left[-1] =
2393 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394 for (i = 2 * 16 - 2; i >= 0; i--)
2395 filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396 top[i - 1] + 2) >> 2;
2397 left = filtered_left;
2398 top = filtered_top;
2399 }
2400 }
2401 }
2402
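    /* 16x16 => log2 size 4, i.e. index 2 in the planar/angular tables. */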
2403 switch (mode) {
2404 case INTRA_PLANAR:
2405 s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2406 (uint8_t *) left, stride);
2407 break;
2408 case INTRA_DC:
2409 s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2410 (uint8_t *) left, stride, 4, c_idx);
2411 break;
2412 default:
2413 s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2414 (uint8_t *) left, stride, c_idx, mode);
2415 break;
2416 }
2417 }
2418
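/* Complete intra prediction of a 32x32 block; same structure as the 16x16
 * entry point, with the addition of the strong (bilinear) intra smoothing
 * path, vectorised with MSA multiplies. */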
2419 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2420 {
2421 v16u8 vec0, vec1;
2422 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423 v8i16 res0, res1, res2, res3;
2424 v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2426 HEVCLocalContext *lc = s->HEVClc;
2427 int i;
2428 int hshift = s->ps.sps->hshift[c_idx];
2429 int vshift = s->ps.sps->vshift[c_idx];
2430 int size_in_luma_h = 32 << hshift;
2431 int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2432 int size_in_luma_v = 32 << vshift;
2433 int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2434 int x = x0 >> hshift;
2435 int y = y0 >> vshift;
2436 int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2437 int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2438
2439 int cur_tb_addr =
2440 s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2441
2442 ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2443 uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2444
2445 int min_pu_width = s->ps.sps->min_pu_width;
2446
2447 enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2448 lc->tu.intra_pred_mode;
2449 uint32_t a;
2450 uint8_t left_array[2 * 32 + 1];
2451 uint8_t filtered_left_array[2 * 32 + 1];
2452 uint8_t top_array[2 * 32 + 1];
2453 uint8_t filtered_top_array[2 * 32 + 1];
2454
2455 uint8_t *left = left_array + 1;
2456 uint8_t *top = top_array + 1;
2457 uint8_t *filtered_left = filtered_left_array + 1;
2458 uint8_t *filtered_top = filtered_top_array + 1;
2459 int cand_bottom_left = lc->na.cand_bottom_left
2460 && cur_tb_addr >
2461 s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2462 (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2463 int cand_left = lc->na.cand_left;
2464 int cand_up_left = lc->na.cand_up_left;
2465 int cand_up = lc->na.cand_up;
2466 int cand_up_right = lc->na.cand_up_right
2467 && cur_tb_addr >
2468 s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2469 ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2470
2471 int bottom_left_size =
2472 (((y0 + 2 * size_in_luma_v) >
2473 (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2474 2 * size_in_luma_v)) -
2475 (y0 + size_in_luma_v)) >> vshift;
2476 int top_right_size =
2477 (((x0 + 2 * size_in_luma_h) >
2478 (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2479 (x0 + size_in_luma_h)) >> hshift;
2480
2481 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2482 int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2483 int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2484 int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2485 int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2486 if (!size_in_luma_pu_h)
2487 size_in_luma_pu_h++;
2488 if (cand_bottom_left == 1 && on_pu_edge_x) {
2489 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2490 int y_bottom_pu =
2491 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2492 int max =
2493 ((size_in_luma_pu_v) >
2494 (s->ps.sps->min_pu_height -
2495 y_bottom_pu) ? (s->ps.sps->min_pu_height -
2496 y_bottom_pu) : (size_in_luma_pu_v));
2497 cand_bottom_left = 0;
2498 for (i = 0; i < max; i += 2)
2499 cand_bottom_left |=
2500 ((s->ref->tab_mvf[(x_left_pu) +
2501 (y_bottom_pu +
2502 i) * min_pu_width]).pred_flag ==
2503 PF_INTRA);
2504 }
2505 if (cand_left == 1 && on_pu_edge_x) {
2506 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2507 int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2508 int max =
2509 ((size_in_luma_pu_v) >
2510 (s->ps.sps->min_pu_height -
2511 y_left_pu) ? (s->ps.sps->min_pu_height -
2512 y_left_pu) : (size_in_luma_pu_v));
2513 cand_left = 0;
2514 for (i = 0; i < max; i += 2)
2515 cand_left |=
2516 ((s->ref->tab_mvf[(x_left_pu) +
2517 (y_left_pu +
2518 i) * min_pu_width]).pred_flag ==
2519 PF_INTRA);
2520 }
2521 if (cand_up_left == 1) {
2522 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2523 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2524 cand_up_left =
2525 (s->ref->tab_mvf[(x_left_pu) +
2526 (y_top_pu) * min_pu_width]).pred_flag ==
2527 PF_INTRA;
2528 }
2529 if (cand_up == 1 && on_pu_edge_y) {
2530 int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2531 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2532 int max =
2533 ((size_in_luma_pu_h) >
2534 (s->ps.sps->min_pu_width -
2535 x_top_pu) ? (s->ps.sps->min_pu_width -
2536 x_top_pu) : (size_in_luma_pu_h));
2537 cand_up = 0;
2538 for (i = 0; i < max; i += 2)
2539 cand_up |=
2540 ((s->ref->tab_mvf[(x_top_pu + i) +
2541 (y_top_pu) *
2542 min_pu_width]).pred_flag == PF_INTRA);
2543 }
2544 if (cand_up_right == 1 && on_pu_edge_y) {
2545 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2546 int x_right_pu =
2547 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2548 int max =
2549 ((size_in_luma_pu_h) >
2550 (s->ps.sps->min_pu_width -
2551 x_right_pu) ? (s->ps.sps->min_pu_width -
2552 x_right_pu) : (size_in_luma_pu_h));
2553 cand_up_right = 0;
2554 for (i = 0; i < max; i += 2)
2555 cand_up_right |=
2556 ((s->ref->tab_mvf[(x_right_pu + i) +
2557 (y_top_pu) *
2558 min_pu_width]).pred_flag == PF_INTRA);
2559 }
2560 vec0 = (v16u8) __msa_ldi_b(128);
2561
2562 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2563 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2564
2565 top[-1] = 128;
2566 }
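    /* Copy the available neighbouring samples into top[] and left[]. */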
2567 if (cand_up_left) {
2568 left[-1] = src[(-1) + stride * (-1)];
2569 top[-1] = left[-1];
2570 }
2571 if (cand_up) {
2572 LD_UB2(src - stride, 16, vec0, vec1);
2573 ST_UB2(vec0, vec1, top, 16);
2574 }
2575
2576 if (cand_up_right) {
2577 LD_UB2(src - stride + 32, 16, vec0, vec1);
2578 ST_UB2(vec0, vec1, (top + 32), 16);
2579 do {
2580 uint32_t pix =
2581 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2582 0x01010101U);
2583 for (i = 0; i < (32 - top_right_size); i += 4)
2584 ((((union unaligned_32 *) (top + 32 + top_right_size +
2585 i))->l) = (pix));
2586 } while (0);
2587 }
2588 if (cand_left)
2589 for (i = 0; i < 32; i++)
2590 left[i] = src[(-1) + stride * (i)];
2591 if (cand_bottom_left) {
2592 for (i = 32; i < 32 + bottom_left_size; i++)
2593 left[i] = src[(-1) + stride * (i)];
2594 do {
2595 uint32_t pix =
2596 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2597 0x01010101U);
2598 for (i = 0; i < (32 - bottom_left_size); i += 4)
2599 ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2600 i))->l) = (pix));
2601 } while (0);
2602 }
2603
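    /* Constrained intra prediction reference substitution, as in the 16x16
     * entry point. */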
2604 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2605 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2606 || cand_up_right) {
2607 int size_max_x =
2608 x0 + ((2 * 32) << hshift) <
2609 s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2610 int size_max_y =
2611 y0 + ((2 * 32) << vshift) <
2612 s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2613 int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2614 if (!cand_up_right) {
2615 size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2616 32 : (s->ps.sps->width - x0) >> hshift;
2617 }
2618 if (!cand_bottom_left) {
2619 size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2620 32 : (s->ps.sps->height - y0) >> vshift;
2621 }
2622 if (cand_bottom_left || cand_left || cand_up_left) {
2623 while (j > -1
2624 &&
2625 !((s->ref->tab_mvf[(((x0 +
2626 ((-1) << hshift)) >> s->ps.sps->
2627 log2_min_pu_size)) + (((y0 +
2628 ((j) <<
2629 vshift))
2630 >> s->ps.sps->
2631 log2_min_pu_size))
2632 * min_pu_width]).pred_flag ==
2633 PF_INTRA))
2634 j--;
2635 if (!
2636 ((s->ref->tab_mvf[(((x0 +
2637 ((-1) << hshift)) >> s->ps.sps->
2638 log2_min_pu_size)) + (((y0 + ((j)
2639 <<
2640 vshift))
2641 >> s->ps.sps->
2642 log2_min_pu_size))
2643 * min_pu_width]).pred_flag == PF_INTRA)) {
2644 j = 0;
2645 while (j < size_max_x
2646 &&
2647 !((s->ref->tab_mvf[(((x0 +
2648 ((j) << hshift)) >> s->ps.sps->
2649 log2_min_pu_size)) + (((y0 +
2650 ((-1) <<
2651 vshift))
2652 >> s->
2653 ps.sps->
2654 log2_min_pu_size))
2655 * min_pu_width]).pred_flag ==
2656 PF_INTRA))
2657 j++;
2658 for (i = j; i > (j) - (j + 1); i--)
2659 if (!
2660 ((s->ref->tab_mvf[(((x0 +
2661 ((i -
2662 1) << hshift)) >> s->ps.sps->
2663 log2_min_pu_size)) + (((y0 +
2664 ((-1) <<
2665 vshift))
2666 >> s->
2667 ps.sps->
2668 log2_min_pu_size))
2669 * min_pu_width]).pred_flag ==
2670 PF_INTRA))
2671 top[i - 1] = top[i];
2672 left[-1] = top[-1];
2673 }
2674 } else {
2675 j = 0;
2676 while (j < size_max_x
2677 &&
2678 !((s->ref->tab_mvf[(((x0 +
2679 ((j) << hshift)) >> s->ps.sps->
2680 log2_min_pu_size)) + (((y0 + ((-1)
2681 <<
2682 vshift))
2683 >> s->ps.sps->
2684 log2_min_pu_size))
2685 * min_pu_width]).pred_flag ==
2686 PF_INTRA))
2687 j++;
2688 if (j > 0)
2689 if (x0 > 0) {
2690 for (i = j; i > (j) - (j + 1); i--)
2691 if (!
2692 ((s->ref->tab_mvf[(((x0 +
2693 ((i -
2694 1) << hshift)) >>
2695 s->ps.sps->log2_min_pu_size))
2696 + (((y0 + ((-1)
2697 << vshift))
2698 >>
2699 s->ps.sps->log2_min_pu_size))
2700 *
2701 min_pu_width]).pred_flag ==
2702 PF_INTRA))
2703 top[i - 1] = top[i];
2704 } else {
2705 for (i = j; i > (j) - (j); i--)
2706 if (!
2707 ((s->ref->tab_mvf[(((x0 +
2708 ((i -
2709 1) << hshift)) >>
2710 s->ps.sps->log2_min_pu_size))
2711 + (((y0 + ((-1)
2712 << vshift))
2713 >>
2714 s->ps.sps->log2_min_pu_size))
2715 *
2716 min_pu_width]).pred_flag ==
2717 PF_INTRA))
2718 top[i - 1] = top[i];
2719 top[-1] = top[0];
2720 }
2721 left[-1] = top[-1];
2722 }
2723 left[-1] = top[-1];
2724 if (cand_bottom_left || cand_left) {
2725 a = ((left[-1]) * 0x01010101U);
2726 for (i = 0; i < (0) + (size_max_y); i += 4)
2727 if (!
2728 ((s->ref->tab_mvf[(((x0 +
2729 ((-1) << hshift)) >> s->ps.sps->
2730 log2_min_pu_size)) + (((y0 +
2731 ((i) <<
2732 vshift))
2733 >> s->ps.sps->
2734 log2_min_pu_size))
2735 * min_pu_width]).pred_flag ==
2736 PF_INTRA))
2737 ((((union unaligned_32 *) (&left[i]))->l) = (a));
2738 else
2739 a = ((left[i + 3]) * 0x01010101U);
2740 }
2741 if (!cand_left) {
2742 vec0 = (v16u8) __msa_fill_b(left[-1]);
2743
2744 ST_UB2(vec0, vec0, left, 16);
2745 }
2746 if (!cand_bottom_left) {
2747 vec0 = (v16u8) __msa_fill_b(left[31]);
2748
2749 ST_UB2(vec0, vec0, (left + 32), 16);
2750 }
2751 if (x0 != 0 && y0 != 0) {
2752 a = ((left[size_max_y - 1]) * 0x01010101U);
2753 for (i = (size_max_y - 1);
2754 i > (size_max_y - 1) - (size_max_y); i -= 4)
2755 if (!
2756 ((s->ref->tab_mvf[(((x0 +
2757 ((-1) << hshift)) >> s->ps.sps->
2758 log2_min_pu_size)) + (((y0 +
2759 ((i -
2760 3) <<
2761 vshift))
2762 >> s->ps.sps->
2763 log2_min_pu_size))
2764 * min_pu_width]).pred_flag ==
2765 PF_INTRA))
2766 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2767 else
2768 a = ((left[i - 3]) * 0x01010101U);
2769 if (!
2770 ((s->ref->tab_mvf[(((x0 +
2771 ((-1) << hshift)) >> s->ps.sps->
2772 log2_min_pu_size)) + (((y0 + ((-1)
2773 <<
2774 vshift))
2775 >> s->ps.sps->
2776 log2_min_pu_size))
2777 * min_pu_width]).pred_flag == PF_INTRA))
2778 left[-1] = left[0];
2779 } else if (x0 == 0) {
2780 do {
2781 uint32_t pix = ((0) * 0x01010101U);
2782 for (i = 0; i < (size_max_y); i += 4)
2783 ((((union unaligned_32 *) (left + i))->l) = (pix));
2784 } while (0);
2785 } else {
2786 a = ((left[size_max_y - 1]) * 0x01010101U);
2787 for (i = (size_max_y - 1);
2788 i > (size_max_y - 1) - (size_max_y); i -= 4)
2789 if (!
2790 ((s->ref->tab_mvf[(((x0 +
2791 ((-1) << hshift)) >> s->ps.sps->
2792 log2_min_pu_size)) + (((y0 +
2793 ((i -
2794 3) <<
2795 vshift))
2796 >> s->ps.sps->
2797 log2_min_pu_size))
2798 * min_pu_width]).pred_flag ==
2799 PF_INTRA))
2800 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2801 else
2802 a = ((left[i - 3]) * 0x01010101U);
2803 }
2804 top[-1] = left[-1];
2805 if (y0 != 0) {
2806 a = ((left[-1]) * 0x01010101U);
2807 for (i = 0; i < (0) + (size_max_x); i += 4)
2808 if (!
2809 ((s->ref->tab_mvf[(((x0 +
2810 ((i) << hshift)) >> s->ps.sps->
2811 log2_min_pu_size)) + (((y0 + ((-1)
2812 <<
2813 vshift))
2814 >> s->ps.sps->
2815 log2_min_pu_size))
2816 * min_pu_width]).pred_flag ==
2817 PF_INTRA))
2818 ((((union unaligned_32 *) (&top[i]))->l) = (a));
2819 else
2820 a = ((top[i + 3]) * 0x01010101U);
2821 }
2822 }
2823 }
2824
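    /* Fill any still-missing reference samples by replication. */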
2825 if (!cand_bottom_left) {
2826 if (cand_left) {
2827 vec0 = (v16u8) __msa_fill_b(left[31]);
2828
2829 ST_UB2(vec0, vec0, (left + 32), 16);
2830 } else if (cand_up_left) {
2831 vec0 = (v16u8) __msa_fill_b(left[-1]);
2832
2833 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2834
2835 cand_left = 1;
2836 } else if (cand_up) {
2837 left[-1] = top[0];
2838
2839 vec0 = (v16u8) __msa_fill_b(left[-1]);
2840
2841 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2842
2843 cand_up_left = 1;
2844 cand_left = 1;
2845 } else if (cand_up_right) {
2846 vec0 = (v16u8) __msa_fill_b(top[32]);
2847
2848 ST_UB2(vec0, vec0, top, 16);
2849
2850 left[-1] = top[32];
2851
2852 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2853
2854 cand_up = 1;
2855 cand_up_left = 1;
2856 cand_left = 1;
2857 } else {
2858 left[-1] = 128;
2859
2860 vec0 = (v16u8) __msa_ldi_b(128);
2861
2862 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2863 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2864 }
2865 }
2866
2867 if (!cand_left) {
2868 vec0 = (v16u8) __msa_fill_b(left[32]);
2869
2870 ST_UB2(vec0, vec0, left, 16);
2871 }
2872 if (!cand_up_left) {
2873 left[-1] = left[0];
2874 }
2875 if (!cand_up) {
2876 vec0 = (v16u8) __msa_fill_b(left[-1]);
2877
2878 ST_UB2(vec0, vec0, top, 16);
2879 }
2880 if (!cand_up_right) {
2881 vec0 = (v16u8) __msa_fill_b(top[31]);
2882
2883 ST_UB2(vec0, vec0, (top + 32), 16);
2884 }
2885
2886 top[-1] = left[-1];
2887
2888
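    /* Reference smoothing: strong (bilinear) smoothing is used for luma when
     * enabled and both reference rows are close to linear ramps, otherwise
     * the regular (1,2,1)/4 filter is applied. */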
2889 if (!s->ps.sps->intra_smoothing_disabled_flag
2890 && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2891 if (mode != INTRA_DC && 32 != 4) {
2892 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2893 int min_dist_vert_hor =
2894 (((((int) (mode - 26U)) >=
2895 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2896 ((((int) (mode - 10U)) >=
2897 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898 ? ((((int) (mode - 10U)) >=
2899 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2900 : ((((int) (mode - 26U)) >=
2901 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2902 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2903 int threshold = 1 << (8 - 5);
2904 if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2905 && c_idx == 0
2906 && ((top[-1] + top[63] - 2 * top[31]) >=
2907 0 ? (top[-1] + top[63] -
2908 2 * top[31]) : (-(top[-1] + top[63] -
2909 2 * top[31]))) < threshold
2910 && ((left[-1] + left[63] - 2 * left[31]) >=
2911 0 ? (left[-1] + left[63] -
2912 2 * left[31]) : (-(left[-1] + left[63] -
2913 2 * left[31]))) < threshold) {
2914
2915
2916 filtered_top[-1] = top[-1];
2917 filtered_top[63] = top[63];
2918
2919
2920 for (i = 0; i < 63; i++) {
2921 filtered_top[i] =
2922 ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2923 }
2924
2925 tmp0 = __msa_fill_h(top[-1]);
2926 tmp1 = __msa_fill_h(top[63]);
2927
2928 tmp2 = mul_val0 - 8;
2929 tmp3 = mul_val0 - 16;
2930 tmp4 = mul_val0 - 24;
2931 tmp5 = mul_val1 + 8;
2932 tmp6 = mul_val1 + 16;
2933 tmp7 = mul_val1 + 24;
2934
2935 res0 = mul_val0 * tmp0;
2936 res1 = tmp2 * tmp0;
2937 res2 = tmp3 * tmp0;
2938 res3 = tmp4 * tmp0;
2939 res0 += mul_val1 * tmp1;
2940 res1 += tmp5 * tmp1;
2941 res2 += tmp6 * tmp1;
2942 res3 += tmp7 * tmp1;
2943
2944 res0 = __msa_srari_h(res0, 6);
2945 res1 = __msa_srari_h(res1, 6);
2946 res2 = __msa_srari_h(res2, 6);
2947 res3 = __msa_srari_h(res3, 6);
2948
2949 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2950 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2951
2952 ST_UB2(vec0, vec1, filtered_top, 16);
2953
2954 res0 = mul_val0 - 32;
2955 tmp2 = mul_val0 - 40;
2956 tmp3 = mul_val0 - 48;
2957 tmp4 = mul_val0 - 56;
2958 res3 = mul_val1 + 32;
2959 tmp5 = mul_val1 + 40;
2960 tmp6 = mul_val1 + 48;
2961 tmp7 = mul_val1 + 56;
2962
2963 res0 = res0 * tmp0;
2964 res1 = tmp2 * tmp0;
2965 res2 = tmp3 * tmp0;
2966 res0 += res3 * tmp1;
2967 res3 = tmp4 * tmp0;
2968 res1 += tmp5 * tmp1;
2969 res2 += tmp6 * tmp1;
2970 res3 += tmp7 * tmp1;
2971
2972 res0 = __msa_srari_h(res0, 6);
2973 res1 = __msa_srari_h(res1, 6);
2974 res2 = __msa_srari_h(res2, 6);
2975 res3 = __msa_srari_h(res3, 6);
2976
2977 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2978 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2979
2980 ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2981
2982 filtered_top[63] = top[63];
2983
2984 tmp0 = __msa_fill_h(left[-1]);
2985 tmp1 = __msa_fill_h(left[63]);
2986
2987 tmp2 = mul_val0 - 8;
2988 tmp3 = mul_val0 - 16;
2989 tmp4 = mul_val0 - 24;
2990 tmp5 = mul_val1 + 8;
2991 tmp6 = mul_val1 + 16;
2992 tmp7 = mul_val1 + 24;
2993
2994 res0 = mul_val0 * tmp0;
2995 res1 = tmp2 * tmp0;
2996 res2 = tmp3 * tmp0;
2997 res3 = tmp4 * tmp0;
2998 res0 += mul_val1 * tmp1;
2999 res1 += tmp5 * tmp1;
3000 res2 += tmp6 * tmp1;
3001 res3 += tmp7 * tmp1;
3002
3003 res0 = __msa_srari_h(res0, 6);
3004 res1 = __msa_srari_h(res1, 6);
3005 res2 = __msa_srari_h(res2, 6);
3006 res3 = __msa_srari_h(res3, 6);
3007
3008 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3009 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3010
3011 ST_UB2(vec0, vec1, left, 16);
3012
3013 res0 = mul_val0 - 32;
3014 tmp2 = mul_val0 - 40;
3015 tmp3 = mul_val0 - 48;
3016 tmp4 = mul_val0 - 56;
3017 res3 = mul_val1 + 32;
3018 tmp5 = mul_val1 + 40;
3019 tmp6 = mul_val1 + 48;
3020 tmp7 = mul_val1 + 56;
3021
3022 res0 = res0 * tmp0;
3023 res1 = tmp2 * tmp0;
3024 res2 = tmp3 * tmp0;
3025 res0 += res3 * tmp1;
3026 res3 = tmp4 * tmp0;
3027 res1 += tmp5 * tmp1;
3028 res2 += tmp6 * tmp1;
3029 res3 += tmp7 * tmp1;
3030
3031 res0 = __msa_srari_h(res0, 6);
3032 res1 = __msa_srari_h(res1, 6);
3033 res2 = __msa_srari_h(res2, 6);
3034 res3 = __msa_srari_h(res3, 6);
3035
3036 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3037 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3038
3039 ST_UB2(vec0, vec1, (left + 32), 16);
3040
3041 left[63] = tmp1[0];
3042
3043 top = filtered_top;
3044 } else {
3045 filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3046 filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3047 for (i = 2 * 32 - 2; i >= 0; i--)
3048 filtered_left[i] = (left[i + 1] + 2 * left[i] +
3049 left[i - 1] + 2) >> 2;
3050 filtered_top[-1] =
3051 filtered_left[-1] =
3052 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3053 for (i = 2 * 32 - 2; i >= 0; i--)
3054 filtered_top[i] = (top[i + 1] + 2 * top[i] +
3055 top[i - 1] + 2) >> 2;
3056 left = filtered_left;
3057 top = filtered_top;
3058 }
3059 }
3060 }
3061 }
3062
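    /* 32x32 => log2 size 5, i.e. index 3 in the planar/angular tables. */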
3063 switch (mode) {
3064 case INTRA_PLANAR:
3065 s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3066 (uint8_t *) left, stride);
3067 break;
3068 case INTRA_DC:
3069 s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3070 (uint8_t *) left, stride, 5, c_idx);
3071 break;
3072 default:
3073 s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3074 (uint8_t *) left, stride, c_idx, mode);
3075 break;
3076 }
3077 }
3078