/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

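/* Transpose four v8i16 vectors as two independent 4x4 halfword blocks:
 * the low halves and the high halves of in0..in3 are transposed
 * separately, so two side-by-side 4x4 blocks can be handled at once. */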
#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \
    \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \
    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \
    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \
  }

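/* Splat lanes val0/val1/val2 of 'coeff' and interleave them into the two
 * halfword-pair constants (const1, const2) used below as dot-product
 * operands. */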
#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \
  { \
    v8i16 tmp0_m; \
    \
    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \
  }

#define RET_1_IF_NZERO_H(in0) \
  ({ \
    v8i16 tmp0_m; \
    v8i16 one_m = __msa_ldi_h(1); \
    \
    tmp0_m = __msa_ceqi_h(in0, 0); \
    tmp0_m = tmp0_m ^ 255; \
    tmp0_m = one_m & tmp0_m; \
    \
    tmp0_m; \
  })

#define RET_1_IF_NZERO_W(in0) \
  ({ \
    v4i32 tmp0_m; \
    v4i32 one_m = __msa_ldi_w(1); \
    \
    tmp0_m = __msa_ceqi_w(in0, 0); \
    tmp0_m = tmp0_m ^ 255; \
    tmp0_m = one_m & tmp0_m; \
    \
    tmp0_m; \
  })

#define RET_1_IF_NEG_W(in0) \
  ({ \
    v4i32 tmp0_m; \
    \
    v4i32 one_m = __msa_ldi_w(1); \
    tmp0_m = __msa_clti_s_w(in0, 0); \
    tmp0_m = one_m & tmp0_m; \
    \
    tmp0_m; \
  })

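/* 4x4 forward DCT, MSA counterpart of the scalar vp8_short_fdct4x4_c():
 * a horizontal (row) pass followed by a vertical (column) pass, using the
 * 2217/5352 cosine constants packed in 'coeff'. 'pitch' is the input row
 * stride in bytes. */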
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

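  /* Horizontal (row) pass: butterflies over the transposed rows with a
   * << 3 scaling; the odd outputs come from the 2217/5352 dot products
   * with 14500/7500 rounding and a >> 12 shift. */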
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

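  /* Vertical (column) pass: even outputs are (a1 +/- b1 + 7) >> 4; odd
   * outputs reuse the dot products with 12000/51000 rounding and a >> 16
   * shift, adding 1 to the second output row wherever d1 != 0. */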
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

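/* 8x4 forward DCT, MSA counterpart of vp8_short_fdct8x4_c(): the same 4x4
 * transform applied to two horizontally adjacent 4x4 blocks at once,
 * writing two 16-coefficient blocks to 'output'. */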
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1, tmp0, tmp1;
  v8i16 const0, const1, const2;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v8i16 zero = { 0 };
  v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

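  /* Horizontal (row) pass over both 4x4 blocks at once: the left block
   * sits in the low halves and the right block in the high halves of the
   * vectors. */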
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
  temp0 = __msa_splati_h(coeff, 3);
  vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
  coeff = __msa_ilvl_h(zero, coeff);
  vec3_w = __msa_splati_w((v4i32)coeff, 0);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

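  /* Vertical (column) pass for both blocks; the results are then split
   * into the two output blocks by the PCKEV/PCKOD packs below. */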
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
  vec3_w += vec1_w;
  vec1_w = __msa_splati_w((v4i32)coeff, 1);
  const0 = RET_1_IF_NZERO_H(in3);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  in1 += const0;
  PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
  ST_SH2(temp0, temp1, output, 8);

  PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output + 16, 8);
}

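/* 4x4 Walsh-Hadamard transform, MSA counterpart of vp8_short_walsh4x4_c(),
 * applied to the block of luma DC coefficients. 'pitch' is the input row
 * stride in bytes. */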
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0_h, in1_h, in2_h, in3_h;
  v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;

  LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
  TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);

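  /* First pass on 32-bit lanes: butterflies with a << 2 scaling, then +1
   * added to the first output of each row when that row's a1 is nonzero. */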
  UNPCK_R_SH_SW(in0_h, in0_w);
  UNPCK_R_SH_SW(in1_h, in1_w);
  UNPCK_R_SH_SW(in2_h, in2_w);
  UNPCK_R_SH_SW(in3_h, in3_w);
  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  SLLI_4V(temp0, temp1, temp2, temp3, 2);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  temp0 = RET_1_IF_NZERO_W(temp0);
  in0_w += temp0;
  TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);

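  /* Second pass: +1 bias on negative values, then round (+3) and >> 3. */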
  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  in0_w += RET_1_IF_NEG_W(in0_w);
  in1_w += RET_1_IF_NEG_W(in1_w);
  in2_w += RET_1_IF_NEG_W(in2_w);
  in3_w += RET_1_IF_NEG_W(in3_w);
  ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
  SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
  PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
  ST_SH2(in0_h, in1_h, output, 8);
}