/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h263dsp_mips.h"

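/* H.263 / MPEG-4 style dequantization of one block: every non-zero
 * coefficient becomes level * qmul + qadd when positive and
 * level * qmul - qadd when negative, while zero coefficients are left
 * untouched.  Eight coefficients are processed per iteration with MSA
 * vectors, starting at 'loop_start'. */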
static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
                                    int16_t qadd, int8_t n_coeffs,
                                    uint8_t loop_start)
{
    int16_t *block_dup = block;
    int32_t level, cnt;
    v8i16 block_vec, qmul_vec, qadd_vec, sub;
    v8i16 add, mask, mul, zero_mask;

    qmul_vec = __msa_fill_h(qmul);
    qadd_vec = __msa_fill_h(qadd);
    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
        block_vec = LD_SH(block_dup + loop_start);
        mask = __msa_clti_s_h(block_vec, 0);
        zero_mask = __msa_ceqi_h(block_vec, 0);
        mul = block_vec * qmul_vec;
        sub = mul - qadd_vec;
        add = mul + qadd_vec;
        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
                                         (v16u8) zero_mask);
        ST_SH(block_vec, block_dup + loop_start);
        block_dup += 8;
    }

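    /* Dequantize any remaining coefficients (the tail that does not fill
     * a full 8-element vector) with scalar code. */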
    cnt = ((n_coeffs >> 3) * 8) + loop_start;

    for (; cnt <= n_coeffs; cnt++) {
        level = block[cnt];
        if (level) {
            if (level < 0) {
                level = level * qmul - qadd;
            } else {
                level = level * qmul + qadd;
            }
            block[cnt] = level;
        }
    }
}

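/* MPEG-2 inter dequantization: each non-zero coefficient becomes
 * sign(level) * (((2 * |level| + 1) * qscale * quant_matrix[i]) >> 4),
 * zero coefficients are left untouched.  The running sum of all output
 * coefficients (seeded with -1) is returned so the caller can apply
 * mismatch control. */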
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
                                              int32_t qscale,
                                              const int16_t *quant_matrix)
{
    int32_t cnt, sum_res = -1;
    v8i16 block_vec, block_neg, qscale_vec, mask;
    v8i16 block_org0, block_org1, block_org2, block_org3;
    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
    v8i16 sum, mul, zero_mask;
    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
    v4i32 block_l, block_r, sad;

    qscale_vec = __msa_fill_h(qscale);
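    /* Each iteration loads four rows of eight coefficients (half of the
     * 8x8 block) together with the matching quantizer matrix entries and
     * dequantizes them row by row. */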
    for (cnt = 0; cnt < 2; cnt++) {
        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
        mask = __msa_clti_s_h(block_org0, 0);
        zero_mask = __msa_ceqi_h(block_org0, 0);
        block_neg = -block_org0;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        mask = __msa_clti_s_h(block_org1, 0);
        zero_mask = __msa_ceqi_h(block_org1, 0);
        block_neg = -block_org1;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);

        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        mask = __msa_clti_s_h(block_org2, 0);
        zero_mask = __msa_ceqi_h(block_org2, 0);
        block_neg = -block_org2;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);

        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        mask = __msa_clti_s_h(block_org3, 0);
        zero_mask = __msa_ceqi_h(block_org3, 0);
        block_neg = -block_org3;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);

        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
    }

    return sum_res;
}

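/* Intra blocks: unless advanced intra coding (h263_aic) is in use, the DC
 * coefficient is scaled by the luma/chroma DC scale and qadd is applied;
 * the AC coefficients are dequantized starting at position 1. */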
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);

    qmul = qscale << 1;

    if (!s->h263_aic) {
        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    } else {
        qadd = 0;
    }
    if (s->ac_pred)
        nCoeffs = 63;
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}

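/* Inter blocks: all coefficients, including position 0, are dequantized
 * with the same qmul/qadd pair (loop_start = 0). */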
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    av_assert2(s->block_last_index[index] >= 0);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
}

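/* MPEG-2 inter blocks: dequantize with the inter matrix, then apply
 * mismatch control by toggling the LSB of the last coefficient so that
 * the sum of all coefficients ends up odd. */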
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
{
    const uint16_t *quant_matrix;
    int32_t sum = -1;

    quant_matrix = s->inter_matrix;

    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);

    block[63] ^= sum & 1;
}