1 /*
2 * Loongson SIMD optimized mpegvideo
3 *
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #include "mpegvideo_mips.h"
26 #include "libavutil/mips/mmiutils.h"
27
/**
 * Dequantize an H.263-style intra block in place (Loongson MMI).
 *
 * block[0] is computed in C and written back after the SIMD loop, which
 * also passes over it: when s->h263_aic is clear it is multiplied by
 * s->y_dc_scale (n < 4) or s->c_dc_scale (otherwise); when it is set,
 * block[0] is kept as is and qadd is 0.
 *
 * The SIMD loop handles eight coefficients (two 8-byte vectors) per
 * iteration, starting at block[0] and ending with the group of eight that
 * contains index nCoeffs, so up to seven coefficients past nCoeffs are
 * also read and rewritten.
 *
 * @param s      codec context; reads h263_aic, ac_pred, y_dc_scale,
 *               c_dc_scale, block_last_index[] and
 *               inter_scantable.raster_end[]
 * @param block  coefficients, modified in place
 * @param n      block index
 * @param qscale quantizer scale; qmul = 2 * qscale,
 *               qadd = (qscale - 1) | 1 (or 0 with h263_aic)
 */
void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
        int n, int qscale)
{
    int64_t level, qmul, qadd, nCoeffs;
    double ftmp[6];
    mips_reg addr[1];
    mips_reg offset;
    DECLARE_VAR_ALL64;

    qmul = qscale << 1;
    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    if (!s->h263_aic) {
        if (n<4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale-1) | 1;
    } else {
        qadd = 0;
        level = block[0];
    }

    if(s->ac_pred)
        nCoeffs = 63;
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];

    /* Negative byte offset relative to &block[nCoeffs]; counts up to > 0. */
    offset = (mips_reg)(2*(-nCoeffs));

    /*
     * %[qmul], %[qadd] and %[nCoeffs] are overwritten inside the asm
     * (packsswh / PTR_ADDIU), so they are declared as read-write
     * earlyclobber operands: GCC requires input-only operands to be left
     * unmodified.
     *
     * Per 16-bit lane x the loop produces (assuming the MMI ops behave like
     * their MMX namesakes -- TODO confirm against the Loongson manual):
     *   x > 0  ->  x * qmul + qadd
     *   x < 0  ->  x * qmul - qadd
     *   x == 0 ->  0
     */
    __asm__ volatile (
        /* ftmp0 = -qadd in every lane, qmul broadcast, ftmp5 = 0 */
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "packsswh %[qmul], %[qmul], %[qmul] \n\t"
        "packsswh %[qmul], %[qmul], %[qmul] \n\t"
        "packsswh %[qadd], %[qadd], %[qadd] \n\t"
        "packsswh %[qadd], %[qadd], %[qadd] \n\t"
        "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
        "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
        ".p2align 4 \n\t"

        "1: \n\t"
        PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
        "mov.d %[ftmp3], %[ftmp1] \n\t"
        "mov.d %[ftmp4], %[ftmp2] \n\t"
        "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
        "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
        /* ftmp3/ftmp4: comparison mask of the original coefficients vs 0 */
        "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
        "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
        /* lanes still equal to -qadd came from x == 0: clear them */
        "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
        "blez %[nCoeffs], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [nCoeffs]"+&r"(offset),
          [qmul]"+&f"(qmul),                [qadd]"+&f"(qadd)
        : [block]"r"((mips_reg)(block+nCoeffs))
        : "memory"
    );

    /* The loop also rewrote block[0]; install the separately scaled DC. */
    block[0] = level;
}
102
/**
 * Dequantize an H.263-style inter block in place (Loongson MMI).
 *
 * The SIMD loop handles eight coefficients (two 8-byte vectors) per
 * iteration, starting at block[0] and ending with the group of eight that
 * contains index nCoeffs, so up to seven coefficients past nCoeffs are
 * also read and rewritten.
 *
 * @param s      codec context; reads h263_aic (assertion only),
 *               block_last_index[] and inter_scantable.raster_end[]
 * @param block  coefficients, modified in place
 * @param n      block index
 * @param qscale quantizer scale; qmul = 2 * qscale,
 *               qadd = (qscale - 1) | 1
 */
void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
        int n, int qscale)
{
    int64_t qmul, qadd, nCoeffs;
    double ftmp[6];
    mips_reg addr[1];
    mips_reg offset;
    DECLARE_VAR_ALL64;

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;
    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];

    /* Negative byte offset relative to &block[nCoeffs]; counts up to > 0. */
    offset = (mips_reg)(2*(-nCoeffs));

    /*
     * %[qmul], %[qadd] and %[nCoeffs] are overwritten inside the asm
     * (packsswh / PTR_ADDIU), so they are declared as read-write
     * earlyclobber operands: GCC requires input-only operands to be left
     * unmodified.
     *
     * Per 16-bit lane x the loop produces (assuming the MMI ops behave like
     * their MMX namesakes -- TODO confirm against the Loongson manual):
     *   x > 0  ->  x * qmul + qadd
     *   x < 0  ->  x * qmul - qadd
     *   x == 0 ->  0
     */
    __asm__ volatile (
        /* qmul broadcast, ftmp0 = -qadd in every lane, ftmp5 = 0 */
        "packsswh %[qmul], %[qmul], %[qmul] \n\t"
        "packsswh %[qmul], %[qmul], %[qmul] \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "packsswh %[qadd], %[qadd], %[qadd] \n\t"
        "packsswh %[qadd], %[qadd], %[qadd] \n\t"
        "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
        "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
        "mov.d %[ftmp3], %[ftmp1] \n\t"
        "mov.d %[ftmp4], %[ftmp2] \n\t"
        "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
        "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
        /* ftmp3/ftmp4: comparison mask of the original coefficients vs 0 */
        "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
        "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
        /* lanes still equal to -qadd came from x == 0: clear them */
        "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
        "blez %[nCoeffs], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [nCoeffs]"+&r"(offset),
          [qmul]"+&f"(qmul),                [qadd]"+&f"(qadd)
        : [block]"r"((mips_reg)(block+nCoeffs))
        : "memory"
    );
}
160
/**
 * Dequantize an MPEG-1 intra block in place (Loongson MMI).
 *
 * The DC value is computed in C (block[0] times s->y_dc_scale for n < 4,
 * s->c_dc_scale otherwise) and stored after the SIMD loop, which also
 * passes over block[0].
 *
 * The loop walks eight coefficients per iteration using a negative byte
 * offset from &block[count] / &s->intra_matrix[count] that counts up to 0.
 * Per 16-bit lane it appears to compute, assuming MMX-like semantics of
 * the MMI ops (TODO confirm):
 *   v = (|x| * (quant * qscale)) >> 3;  v = (v - 1) | 1;
 *   result = x == 0 ? 0 : (x < 0 ? -v : v)
 *
 * @param s      codec context; reads block_last_index[],
 *               intra_scantable.raster_end[], intra_matrix, y_dc_scale
 *               and c_dc_scale
 * @param block  coefficients, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
        int n, int qscale)
{
    /* XXX: only mpeg1 */
    const uint16_t *qmat = s->intra_matrix;
    int64_t count;
    int dc;
    double fr[10];
    uint64_t gr[1];
    mips_reg pos[1];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    av_assert2(s->block_last_index[n]>=0);
    count = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    dc = n < 4 ? block[0] * s->y_dc_scale
               : block[0] * s->c_dc_scale;

    __asm__ volatile (
        /* ftmp0 = all-ones lanes >> 15, ftmp1 = qscale broadcast */
        "dli %[tmp0], 0x0f \n\t"
        "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "dmtc1 %[tmp0], %[ftmp4] \n\t"
        "dmtc1 %[qscale], %[ftmp1] \n\t"
        "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
        "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
        /* addr0 = running (negative) byte offset */
        "or %[addr0], %[nCoeffs], $0 \n\t"
        ".p2align 4 \n\t"

        "1: \n\t"
        /* load 8 coefficients, keep an untouched copy in ftmp4/ftmp5 */
        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
        "mov.d %[ftmp4], %[ftmp2] \n\t"
        "mov.d %[ftmp5], %[ftmp3] \n\t"
        /* quant * qscale */
        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
        "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
        /* ftmp8/ftmp9 = (0 > x) mask, then (x ^ mask) - mask */
        "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
        "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
        "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
        "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        /* ftmp6/ftmp7 = (0 == original x) mask; ftmp4 becomes shift count 3 */
        "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
        "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
        "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
        "dli %[tmp0], 0x03 \n\t"
        "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
        "dmtc1 %[tmp0], %[ftmp4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        /* (v - ftmp0) | ftmp0 */
        "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        /* re-apply the mask from ftmp8/ftmp9, drop lanes that were 0 */
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
        "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
        PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
        "bltz %[addr0], 1b \n\t"
        : [ftmp0]"=&f"(fr[0]),              [ftmp1]"=&f"(fr[1]),
          [ftmp2]"=&f"(fr[2]),              [ftmp3]"=&f"(fr[3]),
          [ftmp4]"=&f"(fr[4]),              [ftmp5]"=&f"(fr[5]),
          [ftmp6]"=&f"(fr[6]),              [ftmp7]"=&f"(fr[7]),
          [ftmp8]"=&f"(fr[8]),              [ftmp9]"=&f"(fr[9]),
          [tmp0]"=&r"(gr[0]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(pos[0])
        : [block]"r"((mips_reg)(block+count)),
          [quant]"r"((mips_reg)(qmat+count)),
          [nCoeffs]"r"((mips_reg)(2*(-count))),
          [qscale]"r"(qscale)
        : "memory"
    );

    /* The loop also rewrote block[0]; install the separately scaled DC. */
    block[0] = dc;
}
254
/**
 * Dequantize an MPEG-1 inter block in place (Loongson MMI).
 *
 * The loop walks eight coefficients per iteration using a negative byte
 * offset from &block[count] / &s->inter_matrix[count] that counts up to 0.
 * Per 16-bit lane it appears to compute, assuming MMX-like semantics of
 * the MMI ops (TODO confirm):
 *   v = ((2 * |x| + 1) * (quant * qscale)) >> 4;  v = (v - 1) | 1;
 *   result = x == 0 ? 0 : (x < 0 ? -v : v)
 *
 * @param s      codec context; reads block_last_index[],
 *               intra_scantable.raster_end[] and inter_matrix
 * @param block  coefficients, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
        int n, int qscale)
{
    const uint16_t *qmat = s->inter_matrix;
    int64_t count;
    double fr[10];
    uint64_t gr[1];
    mips_reg pos[1];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    av_assert2(s->block_last_index[n] >= 0);
    count = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    __asm__ volatile (
        /* ftmp0 = all-ones lanes >> 15, ftmp1 = qscale broadcast */
        "dli %[tmp0], 0x0f \n\t"
        "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "dmtc1 %[tmp0], %[ftmp4] \n\t"
        "dmtc1 %[qscale], %[ftmp1] \n\t"
        "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
        "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
        /* addr0 = running (negative) byte offset */
        "or %[addr0], %[nCoeffs], $0 \n\t"
        ".p2align 4 \n\t"

        "1: \n\t"
        /* load 8 coefficients, keep an untouched copy in ftmp4/ftmp5 */
        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
        "mov.d %[ftmp4], %[ftmp2] \n\t"
        "mov.d %[ftmp5], %[ftmp3] \n\t"
        /* quant * qscale */
        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
        "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
        /* ftmp8/ftmp9 = (0 > x) mask, then (x ^ mask) - mask */
        "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
        "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
        "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
        "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        /* double the magnitude and add ftmp0 */
        "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        /* ftmp6/ftmp7 = (0 == original x) mask; ftmp4 becomes shift count 4 */
        "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
        "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
        "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
        "dli %[tmp0], 0x04 \n\t"
        "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
        "dmtc1 %[tmp0], %[ftmp4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        /* (v - ftmp0) | ftmp0 */
        "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        /* re-apply the mask from ftmp8/ftmp9, drop lanes that were 0 */
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
        "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
        "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
        PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
        "bltz %[addr0], 1b \n\t"
        : [ftmp0]"=&f"(fr[0]),              [ftmp1]"=&f"(fr[1]),
          [ftmp2]"=&f"(fr[2]),              [ftmp3]"=&f"(fr[3]),
          [ftmp4]"=&f"(fr[4]),              [ftmp5]"=&f"(fr[5]),
          [ftmp6]"=&f"(fr[6]),              [ftmp7]"=&f"(fr[7]),
          [ftmp8]"=&f"(fr[8]),              [ftmp9]"=&f"(fr[9]),
          [tmp0]"=&r"(gr[0]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(pos[0])
        : [block]"r"((mips_reg)(block+count)),
          [quant]"r"((mips_reg)(qmat+count)),
          [nCoeffs]"r"((mips_reg)(2*(-count))),
          [qscale]"r"(qscale)
        : "memory"
    );
}
342
/**
 * Dequantize an MPEG-2 intra block in place (Loongson MMI).
 *
 * The DC value is computed in C (block[0] times s->y_dc_scale for n < 4,
 * s->c_dc_scale otherwise) and stored after the SIMD loop, which also
 * passes over block[0].
 *
 * The loop handles eight coefficients per iteration using a negative byte
 * offset from &block[nCoeffs] / &s->intra_matrix[nCoeffs] that counts up
 * until it is > 0; nCoeffs is 63 when s->alternate_scan is set.
 * Per 16-bit lane it appears to compute, assuming MMX-like semantics of
 * the MMI ops (TODO confirm):
 *   v = (|x| * (quant * qscale)) >> 3;
 *   result = x == 0 ? 0 : (x < 0 ? -v : v)
 *
 * @param s      codec context; reads block_last_index[], alternate_scan,
 *               intra_scantable.raster_end[], intra_matrix, y_dc_scale
 *               and c_dc_scale
 * @param block  coefficients, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
        int n, int qscale)
{
    /* Signed like in the sibling functions: the value is negated below. */
    int64_t nCoeffs;
    const uint16_t *quant_matrix;
    int block0;
    double ftmp[10];
    uint64_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    /* Same assertion macro as the other unquantize functions in this file. */
    av_assert2(s->block_last_index[n]>=0);

    if (s->alternate_scan)
        nCoeffs = 63;
    else
        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;

    quant_matrix = s->intra_matrix;

    /*
     * dmtc1 (64-bit move) is used for the shift counts and qscale, matching
     * the MPEG-1 functions: mtc1 only defines the low 32 bits of a 64-bit
     * FPR, while packsswh/psrah below operate on the full register.
     *
     * NOTE(review): ftmp0 is set up here but not read inside the loop.
     */
    __asm__ volatile (
        "dli %[tmp0], 0x0f \n\t"
        "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "dmtc1 %[tmp0], %[ftmp3] \n\t"
        "dmtc1 %[qscale], %[ftmp9] \n\t"
        "psrlh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
        /* ftmp9 = qscale broadcast */
        "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
        "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
        /* addr0 = running (negative) byte offset */
        "or %[addr0], %[nCoeffs], $0 \n\t"
        ".p2align 4 \n\t"

        "1: \n\t"
        /* load 8 coefficients, keep an untouched copy in ftmp3/ftmp4 */
        MMI_LDXC1(%[ftmp1], %[addr0], %[block], 0x00)
        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x08)
        "mov.d %[ftmp3], %[ftmp1] \n\t"
        "mov.d %[ftmp4], %[ftmp2] \n\t"
        /* quant * qscale */
        MMI_LDXC1(%[ftmp5], %[addr0], %[quant], 0x00)
        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x08)
        "pmullh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
        "pmullh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
        /* ftmp7/ftmp8 = (0 > x) mask, then (x ^ mask) - mask */
        "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
        "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
        "pcmpgth %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
        "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
        "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        /* ftmp5/ftmp6 = (0 == original x) mask; ftmp3 becomes shift count 3 */
        "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
        "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
        "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
        "dli %[tmp0], 0x03 \n\t"
        "pcmpeqh %[ftmp6] , %[ftmp6], %[ftmp4] \n\t"
        "dmtc1 %[tmp0], %[ftmp3] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
        /* re-apply the mask from ftmp7/ftmp8, drop lanes that were 0 */
        "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
        "pandn %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
        "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
        MMI_SDXC1(%[ftmp5], %[addr0], %[block], 0x00)
        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x08)
        PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
        "blez %[addr0], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0])
        : [block]"r"((mips_reg)(block+nCoeffs)),
          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
          [qscale]"r"(qscale)
        : "memory"
    );

    /* The loop also rewrote block[0]; install the separately scaled DC. */
    block[0]= block0;
}
435
/**
 * Apply the DCT noise-reduction offsets to a 64-coefficient block in place
 * and accumulate per-coefficient statistics (Loongson MMI).
 *
 * Increments s->dct_count[s->mb_intra], then for each group of eight
 * coefficients (assuming MMX-like semantics of the MMI ops -- TODO confirm):
 *   - takes the magnitude of each coefficient,
 *   - adds that magnitude, widened to 32 bits, to the matching entry of
 *     s->dct_error_sum[s->mb_intra],
 *   - subtracts the matching s->dct_offset[s->mb_intra] entry from the
 *     magnitude with unsigned saturation and stores the result back with
 *     the original sign.
 *
 * @param s      codec context; reads mb_intra and dct_offset[], updates
 *               dct_count[] and dct_error_sum[]
 * @param block  64 coefficients, modified in place
 */
void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
{
    const int intra = s->mb_intra;
    int *err_sum = s->dct_error_sum[intra];
    uint16_t *bias = s->dct_offset[intra];
    int16_t *const block_end = block + 64;
    double fr[8];
    mips_reg left[1];
    DECLARE_VAR_ALL64;

    s->dct_count[intra]++;

    __asm__ volatile(
        /* ftmp0 = 0, used as the high half when widening to 32 bits */
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "1: \n\t"
        /* load 8 coefficients; ftmp2/ftmp4 = (0 > x) masks */
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
        MMI_LDC1(%[ftmp3], %[block], 0x08)
        "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
        "pcmpgth %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
        "pcmpgth %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
        /* (x ^ mask) - mask */
        "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        /* keep a copy in ftmp5/ftmp7, subtract the offsets from the rest */
        MMI_LDC1(%[ftmp6], %[offset], 0x00)
        "mov.d %[ftmp5], %[ftmp1] \n\t"
        "psubush %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
        MMI_LDC1(%[ftmp6], %[offset], 0x08)
        "mov.d %[ftmp7], %[ftmp3] \n\t"
        "psubush %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
        /* re-apply the mask and store the coefficients */
        "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        MMI_SDC1(%[ftmp3], %[block], 0x08)
        /* interleave the saved copies with zero and add them to sum[] */
        "mov.d %[ftmp1], %[ftmp5] \n\t"
        "mov.d %[ftmp3], %[ftmp7] \n\t"
        "punpcklhw %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpckhhw %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpckhhw %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[sum], 0x00)
        "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
        MMI_LDC1(%[ftmp2], %[sum], 0x08)
        "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        MMI_LDC1(%[ftmp2], %[sum], 0x10)
        "paddw %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
        MMI_LDC1(%[ftmp2], %[sum], 0x18)
        "paddw %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
        MMI_SDC1(%[ftmp5], %[sum], 0x00)
        MMI_SDC1(%[ftmp1], %[sum], 0x08)
        MMI_SDC1(%[ftmp7], %[sum], 0x10)
        MMI_SDC1(%[ftmp3], %[sum], 0x18)
        /* advance all three pointers; loop while block < block + 64 */
        PTR_ADDIU "%[block], %[block], 0x10 \n\t"
        PTR_ADDIU "%[sum], %[sum], 0x20 \n\t"
        PTR_SUBU "%[addr0], %[block1], %[block] \n\t"
        PTR_ADDIU "%[offset], %[offset], 0x10 \n\t"
        "bgtz %[addr0], 1b \n\t"
        : [ftmp0]"=&f"(fr[0]),              [ftmp1]"=&f"(fr[1]),
          [ftmp2]"=&f"(fr[2]),              [ftmp3]"=&f"(fr[3]),
          [ftmp4]"=&f"(fr[4]),              [ftmp5]"=&f"(fr[5]),
          [ftmp6]"=&f"(fr[6]),              [ftmp7]"=&f"(fr[7]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(left[0]),
          [block]"+&r"(block),              [sum]"+&r"(err_sum),
          [offset]"+&r"(bias)
        : [block1]"r"(block_end)
        : "memory"
    );
}
507