/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)              \
    "li         %[tmp0],    "#r1"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                           \n\t"   \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]               \n\t"   \
    "li         %[tmp0],    "#r2"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                           \n\t"   \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]               \n\t"   \
    "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]               \n\t"   \
    "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]               \n\t"   \
    "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                \n\t"   \
                                                                        \
    "li         %[tmp0],    "#r3"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                           \n\t"   \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]               \n\t"   \
    "li         %[tmp0],    "#r4"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                           \n\t"   \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]               \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]               \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]               \n\t"   \
    "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]               \n\t"   \
                                                                        \
    "paddw      %[ftmp1],   %[ftmp1],   "#c0"                   \n\t"   \
    "paddw      %[ftmp2],   %[ftmp2],   "#c0"                   \n\t"   \
    "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                \n\t"   \
    "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                \n\t"   \
    "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                \n\t"   \
    "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                \n\t"   \
    "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                \n\t"   \
    "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
    "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                \n\t"   \
    "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
    "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                \n\t"   \
    "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                \n\t"   \
    "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                \n\t"   \
    "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)          \
    "li         %[tmp0],    "#r1"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                           \n\t"   \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]               \n\t"   \
    "li         %[tmp0],    "#r2"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                           \n\t"   \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]               \n\t"   \
    "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]               \n\t"   \
    "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]               \n\t"   \
    "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                \n\t"   \
                                                                        \
    "li         %[tmp0],    "#r3"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                           \n\t"   \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]               \n\t"   \
    "li         %[tmp0],    "#r4"                               \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                           \n\t"   \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]               \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]               \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]               \n\t"   \
    "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]               \n\t"   \
    "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]               \n\t"   \
                                                                        \
    "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                \n\t"   \
    "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                \n\t"   \
    "paddw      %[ftmp14],  %[ftmp14],  "#c1"                   \n\t"   \
    "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                \n\t"   \
    "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   "#c1"                   \n\t"   \
    "paddw      %[ftmp13],  %[ftmp13],  "#c0"                   \n\t"   \
    "paddw      %[ftmp14],  %[ftmp14],  "#c0"                   \n\t"   \
    "paddw      %[ftmp1],   %[ftmp1],   "#c0"                   \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   "#c0"                   \n\t"   \
    "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                \n\t"   \
    "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
    "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                \n\t"   \
    "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
    "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                \n\t"   \
    "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                \n\t"   \
    "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                \n\t"   \
    "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                \n\t"

/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;
    union mmi_intfloat64 dc_u;

    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;
    dc_u.i = dc;
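    /* A note on the folded rounding (nothing new, just the algebra): on a
     * DC-only block the 8-point row pass yields (12 * dc + 4) >> 3, which
     * equals (3 * dc + 1) >> 1, and the column pass yields
     * (12 * dc + 64) >> 7 == (3 * dc + 16) >> 5. */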

    __asm__ volatile(
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"
        "li         %[count],   0x02                            \n\t"

        "1:                                                     \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]         \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]            \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]         \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],   -0x01               \n\t"
        PTR_ADDU   "%[dest],    %[addr0],   %[linesize]         \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1],  %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2],  %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3],  %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4],  %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1],  %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2],  %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3],  %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4],  %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1],  %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2],  %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]           \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1],  %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2],  %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3],  %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4],  %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f),
          [ff_pw_4]"f"(ff_pw_32_4.f), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    union mmi_intfloat64 dc_u;

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    dc_u.i = dc;
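    /* Same folding as in the 8x8 DC case: (3 * dc + 1) >> 1 is the 8-point
     * row-pass DC response, (17 * dc + 64) >> 7 the 4-point column pass. */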

    __asm__ volatile(
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]            \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    int16_t coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4,
                         12, 15, 6, -4, -12, -16, -16, -9,
                         12, 9, -6, -16, -12, 4, 16, 15,
                         12, 4, -16, -9, 12, 15, -6, -16,
                         12, -4, -16, 9, 12, -15, -6, 16,
                         12, -9, -6, 16, -12, -4, 16, -15,
                         12, -15, 6, 4, -12, 16, -16, 9,
                         12, -16, 16, -15, 12, -9, 6, -4};
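    /* An observation on the table layout: coeff[] holds the full 8-point
     * VC-1 inverse-transform matrix, one output per 8-coefficient row; each
     * pmaddhw/paddw group in the loop below accumulates one such dot
     * product. */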

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        "1:                                                     \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]          \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]          \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]          \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]          \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]            \n\t"
        "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]            \n\t"
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]            \n\t"
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]           \n\t"
        "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        MMI_SDC1(%[ftmp9],  %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x44                            \n\t"
        "mtc1       %[tmp0],    %[ftmp15]                       \n\t"

        // 1st part
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xffeaffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x0016ffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]           \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xffeaffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x0016ffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]           \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    union mmi_intfloat64 dc_u;
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    dc_u.i = dc;
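    /* Here the passes are swapped relative to 8x4: (17 * dc + 4) >> 3 is the
     * 4-point row transform's DC response, (12 * dc + 64) >> 7 the 8-point
     * column one. */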

    __asm__ volatile(
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint64_t count = 8, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
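    /* coeff[] is the 4-point VC-1 inverse-transform matrix: row k dotted
     * with a line of four residuals gives output k, which is what each
     * pmaddhw/paddw group in the loop below computes. */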

    // 1st loop
    __asm__ volatile (

        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1:                                                     \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]            \n\t"
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]           \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]            \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]          \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]          \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]            \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]           \n\t"
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]           \n\t"
        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]           \n\t"
        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]           \n\t"
        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]           \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    union mmi_intfloat64 dc_u;
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    dc_u.i = dc;
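    /* Both passes are 4-point in the 4x4 case: (17 * dc + 4) >> 3 for the
     * rows, then (17 * dc + 64) >> 7 for the columns. */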

    __asm__ volatile(
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
    // 1st loop
    __asm__ volatile (

        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1:                                                     \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]            \n\t"
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]           \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]            \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]          \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]          \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]            \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        "li         %[tmp0],    0x44                            \n\t"
        "mtc1       %[tmp0],    %[ftmp15]                       \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xffeaffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x0016ffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]           \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a = src[-2];
        b = src[-1];
        c = src[0];
        d = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src += stride;
        rnd = !rnd;
    }
}
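
/*
 * Informal note: d1/d2 above implement the VC-1 overlap-smoothing matrix
 *
 *     [ 7  0  0  1 ]
 *     [-1  7  1  1 ]  .  [a b c d]^T  /  8
 *     [ 1  1  7 -1 ]
 *     [ 1  0  0  7 ]
 *
 * with the rounding term alternating from line to line; the s_overlap and
 * v_overlap variants below share the same math on different layouts.
 */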

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a = left[6];
        b = left[7];
        c = right[0];
        d = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2 = 7 - rnd2;
            rnd1 = 7 - rnd1;
        }
    }
}

1430 /* Apply overlap transform to vertical edge */
1431 void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride)
1432 {
1433 int i;
1434 int a, b, c, d;
1435 int d1, d2;
1436 int rnd = 1;
1437 for (i = 0; i < 8; i++) {
1438 a = src[-2 * stride];
1439 b = src[-stride];
1440 c = src[0];
1441 d = src[stride];
1442 d1 = (a - d + 3 + rnd) >> 3;
1443 d2 = (a - d + b - c + 4 - rnd) >> 3;
1444
1445 src[-2 * stride] = a - d1;
1446 src[-stride] = av_clip_uint8(b - d2);
1447 src[0] = av_clip_uint8(c + d2);
1448 src[stride] = d + d1;
1449 src++;
1450 rnd = !rnd;
1451 }
1452 }
1453
1454 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1455 {
1456 int i;
1457 int a, b, c, d;
1458 int d1, d2;
1459 int rnd1 = 4, rnd2 = 3;
1460 for (i = 0; i < 8; i++) {
1461 a = top[48];
1462 b = top[56];
1463 c = bottom[0];
1464 d = bottom[8];
1465 d1 = a - d;
1466 d2 = a - d + b - c;
1467
1468 top[48] = ((a << 3) - d1 + rnd1) >> 3;
1469 top[56] = ((b << 3) - d2 + rnd2) >> 3;
1470 bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1471 bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1472
1473 bottom++;
1474 top++;
1475 rnd2 = 7 - rnd2;
1476 rnd1 = 7 - rnd1;
1477 }
1478 }
1479
1480 /**
1481 * VC-1 in-loop deblocking filter for one line
1482  * @param src pointer into the source block, at the edge to filter
1483 * @param stride block stride
1484 * @param pq block quantizer
1485 * @return whether other 3 pairs should be filtered or not
1486 * @see 8.6
1487 */
1488 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1489 {
1490 int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1491 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1492 int a0_sign = a0 >> 31; /* Store sign */
1493
1494 a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
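    /* Branchless abs via the sign mask: sign = x >> 31 is 0 or -1, and
     * (x ^ sign) - sign == FFABS(x); e.g. x = -5: (-5 ^ -1) - (-1) = 5. */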
1495 if (a0 < pq) {
1496 int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1497 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1498 int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1499 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1500 if (a1 < a0 || a2 < a0) {
1501 int clip = src[-1 * stride] - src[0 * stride];
1502 int clip_sign = clip >> 31;
1503
1504 clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1505 if (clip) {
1506 int a3 = FFMIN(a1, a2);
1507 int d = 5 * (a3 - a0);
1508 int d_sign = (d >> 31);
1509
1510 d = ((d ^ d_sign) - d_sign) >> 3;
1511 d_sign ^= a0_sign;
1512
1513 if (d_sign ^ clip_sign)
1514 d = 0;
1515 else {
1516 d = FFMIN(d, clip);
1517 d = (d ^ d_sign) - d_sign; /* Restore sign */
1518 src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1519 src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1520 }
1521 return 1;
1522 }
1523 }
1524 }
1525 return 0;
1526 }
1527
1528 /**
1529 * VC-1 in-loop deblocking filter
1530  * @param src pointer into the source block, at the edge to filter
1531 * @param step distance between horizontally adjacent elements
1532 * @param stride distance between vertically adjacent elements
1533  * @param len edge length to filter (multiple of 4 pixels)
1534 * @param pq block quantizer
1535 * @see 8.6
1536 */
1537 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1538 int len, int pq)
1539 {
1540 int i;
1541 int filt3;
1542
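    /* Per 8.6 the filtering decision is taken on the 3rd pixel pair of
     * each group of four; only when that pair is filtered are the other
     * three pairs of the group considered. */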
1543 for (i = 0; i < len; i += 4) {
1544 filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1545 if (filt3) {
1546 vc1_filter_line(src + 0 * step, stride, pq);
1547 vc1_filter_line(src + 1 * step, stride, pq);
1548 vc1_filter_line(src + 3 * step, stride, pq);
1549 }
1550 src += step * 4;
1551 }
1552 }
1553
1554 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1555 {
1556 vc1_loop_filter(src, 1, stride, 4, pq);
1557 }
1558
1559 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1560 {
1561 vc1_loop_filter(src, stride, 1, 4, pq);
1562 }
1563
1564 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1565 {
1566 vc1_loop_filter(src, 1, stride, 8, pq);
1567 }
1568
1569 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1570 {
1571 vc1_loop_filter(src, stride, 1, 8, pq);
1572 }
1573
1574 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1575 {
1576 vc1_loop_filter(src, 1, stride, 16, pq);
1577 }
1578
1579 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1580 {
1581 vc1_loop_filter(src, stride, 1, 16, pq);
1582 }
1583
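/* mc00 is the integer-pel case: no filtering is needed, so these reduce
 * to a plain hpeldsp copy or average of the 8x8 / 16x16 block. */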
1584 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1585 ptrdiff_t stride, int rnd)
1586 {
1587 ff_put_pixels8_8_mmi(dst, src, stride, 8);
1588 }
1589 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1590 ptrdiff_t stride, int rnd)
1591 {
1592 ff_put_pixels16_8_mmi(dst, src, stride, 16);
1593 }
1594 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1595 ptrdiff_t stride, int rnd)
1596 {
1597 ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1598 }
1599 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1600 ptrdiff_t stride, int rnd)
1601 {
1602 ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1603 }
1604
1605 #define OP_PUT(S, D)
1606 #define OP_AVG(S, D) \
1607 "ldc1 $f16, "#S" \n\t" \
1608 "pavgb "#D", "#D", $f16 \n\t"
1609
1610 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1611 #define NORMALIZE_MMI(SHIFT) \
1612 "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1613 "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1614 "psrah $f6, $f6, "SHIFT" \n\t" \
1615 "psrah $f8, $f8, "SHIFT" \n\t"
1616
1617 #define TRANSFER_DO_PACK(OP) \
1618 "packushb $f6, $f6, $f8 \n\t" \
1619 OP((%[dst]), $f6) \
1620 "sdc1 $f6, 0x00(%[dst]) \n\t"
1621
1622 #define TRANSFER_DONT_PACK(OP) \
1623 OP(0(%[dst]), $f6) \
1624 OP(8(%[dst]), $f8) \
1625 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1626 "sdc1 $f8, 0x08(%[dst]) \n\t"
1627
1628 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1629 #define DO_UNPACK(reg) \
1630 "punpcklbh "reg", "reg", $f0 \n\t"
1631 #define DONT_UNPACK(reg)
1632
1633 /** Load the rounder (32-r or 8-r) and splat it to all four halfwords of $f14 */
1634 #define LOAD_ROUNDER_MMI(ROUND) \
1635 "lwc1 $f14, "ROUND" \n\t" \
1636 "punpcklhw $f14, $f14, $f14 \n\t" \
1637 "punpcklwd $f14, $f14, $f14 \n\t"
1638
1639
1640 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1641 "paddh "#R1", "#R1", "#R2" \n\t" \
1642 PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1643 MMI_ULWC1(R0, $9, 0x00) \
1644 "pmullh "#R1", "#R1", $f6 \n\t" \
1645 "punpcklbh "#R0", "#R0", $f0 \n\t" \
1646 PTR_ADDU "$9, %[src], %[stride] \n\t" \
1647 MMI_ULWC1(R3, $9, 0x00) \
1648 "psubh "#R1", "#R1", "#R0" \n\t" \
1649 "punpcklbh "#R3", "#R3", $f0 \n\t" \
1650 "paddh "#R1", "#R1", $f14 \n\t" \
1651 "psubh "#R1", "#R1", "#R3" \n\t" \
1652 "psrah "#R1", "#R1", %[shift] \n\t" \
1653 MMI_SDC1(R1, %[dst], OFF) \
1654 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1655
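/*
 * Dataflow sketch for SHIFT2_LINE: R0..R3 hold four consecutive source
 * rows, and one output row of the 1/2-pel vertical filter is
 *     R1 = (9 * (R1 + R2) - R0 - R3 + rounder) >> shift
 * where R0 (row above) and R3 (row below) are loaded within the same
 * step and then rotate through the register sequence for later rows.
 */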
1656 /** Sacrificing $f12 makes it possible to pipeline loads from src */
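/*
 * Fills a 12x8 block of 16-bit intermediates (row stride 24 bytes):
 * three passes of the loop below, four columns per pass, eight rows per
 * pass via the SHIFT2_LINE chain. A sketch of the layout; the matching
 * horizontal pass below consumes this buffer.
 */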
1657 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1658 const uint8_t *src, mips_reg stride,
1659 int rnd, int64_t shift)
1660 {
1661 union mmi_intfloat64 shift_u;
1662 DECLARE_VAR_LOW32;
1663 DECLARE_VAR_ADDRT;
1664 shift_u.i = shift;
1665
1666 __asm__ volatile(
1667 "pxor $f0, $f0, $f0 \n\t"
1668 "li $8, 0x03 \n\t"
1669 LOAD_ROUNDER_MMI("%[rnd]")
"ldc1 $f12, %[ff_pw_9] \n\t"
1670 "1: \n\t"
1671 MMI_ULWC1($f4, %[src], 0x00)
1672 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1673 MMI_ULWC1($f6, %[src], 0x00)
1674 "punpcklbh $f4, $f4, $f0 \n\t"
1675 "punpcklbh $f6, $f6, $f0 \n\t"
1676 SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1677 SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1678 SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1679 SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1680 SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1681 SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1682 SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1683 SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1684 PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1685 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1686 "addiu $8, $8, -0x01 \n\t"
1687 "bnez $8, 1b \n\t"
1688 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1689 [src]"+r"(src), [dst]"+r"(dst)
1690 : [stride]"r"(stride), [stride1]"r"(-2*stride),
1691 [shift]"f"(shift_u.f), [rnd]"m"(rnd),
1692 [stride2]"r"(9*stride-4)
1693 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1694 "$f14", "$f16", "memory"
1695 );
1696 }
1697
1698 /**
1699 * The data is already unpacked to 16 bits, so some operands can be
1700 * used directly from memory.
1701 */
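/*
 * Concretely: out = (-s[-1] + 9*s[0] + 9*s[1] - s[2] + rounder) >> 7 on
 * each 16-bit intermediate, with the accumulated bias removed via the
 * two ff_pw_128 additions before packing back to 8 bits.
 */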
1702 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1703 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1704 const int16_t *src, int rnd) \
1705 { \
1706 int h = 8; \
1707 DECLARE_VAR_ALL64; \
1708 DECLARE_VAR_ADDRT; \
1709 \
1710 src -= 1; \
1711 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1712 \
1713 __asm__ volatile( \
1714 LOAD_ROUNDER_MMI("%[rnd]") \
1715 "1: \n\t" \
1716 MMI_ULDC1($f2, %[src], 0x00) \
1717 MMI_ULDC1($f4, %[src], 0x08) \
1718 MMI_ULDC1($f6, %[src], 0x02) \
1719 MMI_ULDC1($f8, %[src], 0x0a) \
1720 MMI_ULDC1($f0, %[src], 0x06) \
1721 "paddh $f2, $f2, $f0 \n\t" \
1722 MMI_ULDC1($f0, %[src], 0x0e) \
1723 "paddh $f4, $f4, $f0 \n\t" \
1724 MMI_ULDC1($f0, %[src], 0x04) \
1725 "paddh $f6, $f6, $f0 \n\t" \
1726 MMI_ULDC1($f0, %[src], 0x0c) \
1727 "paddh $f8, $f8, $f0 \n\t" \
1728 "pmullh $f6, $f6, %[ff_pw_9] \n\t" \
1729 "pmullh $f8, $f8, %[ff_pw_9] \n\t" \
1730 "psubh $f6, $f6, $f2 \n\t" \
1731 "psubh $f8, $f8, $f4 \n\t" \
1732 "li $8, 0x07 \n\t" \
1733 "mtc1 $8, $f16 \n\t" \
1734 NORMALIZE_MMI("$f16") \
1735 /* Remove bias */ \
1736 "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1737 "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1738 TRANSFER_DO_PACK(OP) \
1739 "addiu %[h], %[h], -0x01 \n\t" \
1740 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1741 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1742 "bnez %[h], 1b \n\t" \
1743 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1744 [h]"+r"(h), \
1745 [src]"+r"(src), [dst]"+r"(dst) \
1746 : [stride]"r"(stride), [rnd]"m"(rnd), \
1747 [ff_pw_9]"f"(ff_pw_9.f), [ff_pw_128]"f"(ff_pw_128.f) \
1748 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14", \
1749 "$f16", "memory" \
1750 ); \
1751 }
1752
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755
1756 /**
1757 * Purely vertical or horizontal 1/2 shift interpolation.
1758 * One FP register is dedicated to the *9 factor (%[ff_pw_9]).
1759 */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1762 mips_reg stride, int rnd, \
1763 mips_reg offset) \
1764 { \
1765 DECLARE_VAR_LOW32; \
1766 DECLARE_VAR_ADDRT; \
1767 \
1768 rnd = 8 - rnd; \
1769 \
1770 __asm__ volatile( \
1771 "pxor $f0, $f0, $f0 \n\t" \
1772 "li $10, 0x08 \n\t" \
1773 LOAD_ROUNDER_MMI("%[rnd]") \
1774 "1: \n\t" \
1775 MMI_ULWC1($f6, %[src], 0x00) \
1776 MMI_ULWC1($f8, %[src], 0x04) \
1777 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1778 MMI_ULWC1($f2, $9, 0x00) \
1779 MMI_ULWC1($f4, $9, 0x04) \
1780 PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1781 "punpcklbh $f6, $f6, $f0 \n\t" \
1782 "punpcklbh $f8, $f8, $f0 \n\t" \
1783 "punpcklbh $f2, $f2, $f0 \n\t" \
1784 "punpcklbh $f4, $f4, $f0 \n\t" \
1785 "paddh $f6, $f6, $f2 \n\t" \
1786 "paddh $f8, $f8, $f4 \n\t" \
1787 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1788 MMI_ULWC1($f2, $9, 0x00) \
1789 MMI_ULWC1($f4, $9, 0x04) \
1790 "pmullh $f6, $f6, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \
1791 "pmullh $f8, $f8, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \
1792 "punpcklbh $f2, $f2, $f0 \n\t" \
1793 "punpcklbh $f4, $f4, $f0 \n\t" \
1794 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1795 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1796 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1797 MMI_ULWC1($f2, $9, 0x00) \
1798 MMI_ULWC1($f4, $9, 0x04) \
1799 "punpcklbh $f2, $f2, $f0 \n\t" \
1800 "punpcklbh $f4, $f4, $f0 \n\t" \
1801 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1802 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1803 "li $8, 0x04 \n\t" \
1804 "mtc1 $8, $f16 \n\t" \
1805 NORMALIZE_MMI("$f16") \
1806 "packushb $f6, $f6, $f8 \n\t" \
1807 OP((%[dst]), $f6) \
1808 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1809 "addiu $10, $10, -0x01 \n\t" \
1810 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1811 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1812 "bnez $10, 1b \n\t" \
1813 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1814 [src]"+r"(src), [dst]"+r"(dst) \
1815 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1816 [stride]"r"(stride), [rnd]"m"(rnd), \
1817 [stride1]"r"(stride-offset), \
1818 [ff_pw_9]"f"(ff_pw_9.f) \
1819 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1820 "$f14", "$f16", "memory" \
1821 ); \
1822 }
1823
1824 VC1_SHIFT2(OP_PUT, put_)
1825 VC1_SHIFT2(OP_AVG, avg_)
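/*
 * The offset argument selects the filter axis: the dispatcher below
 * passes offset=1 for a horizontal 1/2 shift and offset=stride for a
 * vertical one, reusing the same body for both directions.
 */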
1826
1827 /**
1828 * Core of the 1/4 and 3/4 shift bicubic interpolation.
1829 *
1830  * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
1831  * @param LOAD   "MMI_ULWC1" for packed 8-bit data, or "MMI_ULDC1" if the data read is already unpacked.
1832  * @param M      Bytes per element: "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1833 * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1834 * @param A2 Stride address of 2nd tap
1835 * @param A3 Stride address of 3rd tap
1836 * @param A4 Stride address of 4th tap
1837 */
1838 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1839 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1840 LOAD($f2, $9, M*0) \
1841 LOAD($f4, $9, M*4) \
1842 UNPACK("$f2") \
1843 UNPACK("$f4") \
1844 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1845 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1846 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1847 LOAD($f6, $9, M*0) \
1848 LOAD($f8, $9, M*4) \
1849 UNPACK("$f6") \
1850 UNPACK("$f8") \
1851 "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \
1852 "pmullh $f8, $f8, %[ff_pw_18] \n\t" /* *18 */ \
1853 "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1854 "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1855 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1856 LOAD($f2, $9, M*0) \
1857 LOAD($f4, $9, M*4) \
1858 UNPACK("$f2") \
1859 UNPACK("$f4") \
1860 "li $8, 0x02 \n\t" \
1861 "mtc1 $8, $f16 \n\t" \
1862 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1863 "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1864 "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1865 "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1866 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1867 LOAD($f2, $9, M*0) \
1868 LOAD($f4, $9, M*4) \
1869 UNPACK("$f2") \
1870 UNPACK("$f4") \
1871 "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \
1872 "pmullh $f4, $f4, %[ff_pw_53] \n\t" /* *53 */ \
1873 "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1874 "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1875
1876 /**
1877  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1878  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1879  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride), %[stride_x3] (3*src_stride) or $0.
1880 *
1881 * @param NAME Either 1 or 3
1882 * @see MSPEL_FILTER13_CORE for information on A1->A4
1883 */
1884 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1885 static void \
1886 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1887 mips_reg src_stride, \
1888 int rnd, int64_t shift) \
1889 { \
1890 int h = 8; \
1891 union mmi_intfloat64 shift_u; \
1892 DECLARE_VAR_LOW32; \
1893 DECLARE_VAR_ADDRT; \
1894 shift_u.i = shift; \
1895 \
1896 src -= src_stride; \
1897 \
1898 __asm__ volatile( \
1899 "pxor $f0, $f0, $f0 \n\t" \
1900 LOAD_ROUNDER_MMI("%[rnd]") \
1901 ".p2align 3 \n\t" \
1902 "1: \n\t" \
1903 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1904 NORMALIZE_MMI("%[shift]") \
1905 TRANSFER_DONT_PACK(OP_PUT) \
1906 /* Last 3 (in fact 4) bytes on the line */ \
1907 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1908 MMI_ULWC1($f2, $9, 0x08) \
1909 DO_UNPACK("$f2") \
1910 "mov.d $f6, $f2 \n\t" \
1911 "paddh $f2, $f2, $f2 \n\t" \
1912 "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1913 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1914 MMI_ULWC1($f6, $9, 0x08) \
1915 DO_UNPACK("$f6") \
1916 "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \
1917 "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1918 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1919 MMI_ULWC1($f2, $9, 0x08) \
1920 DO_UNPACK("$f2") \
1921 "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \
1922 "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1923 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1924 MMI_ULWC1($f2, $9, 0x08) \
1925 DO_UNPACK("$f2") \
1926 "li $8, 0x02 \n\t" \
1927 "mtc1 $8, $f16 \n\t" \
1928 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1929 "psubh $f6, $f6, $f2 \n\t" \
1930 "paddh $f6, $f6, $f14 \n\t" \
1931 "li $8, 0x06 \n\t" \
1932 "mtc1 $8, $f16 \n\t" \
1933 "psrah $f6, $f6, $f16 \n\t" \
1934 "sdc1 $f6, 0x10(%[dst]) \n\t" \
1935 "addiu %[h], %[h], -0x01 \n\t" \
1936 PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1937 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1938 "bnez %[h], 1b \n\t" \
1939 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1940 [h]"+r"(h), \
1941 [src]"+r"(src), [dst]"+r"(dst) \
1942 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1943 [stride_x3]"r"(3*src_stride), \
1944 [rnd]"m"(rnd), [shift]"f"(shift_u.f), \
1945 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1946 [ff_pw_3]"f"(ff_pw_3.f) \
1947 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1948 "$f14", "$f16", "memory" \
1949 ); \
1950 }
1951
1952 /**
1953  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1954  * Here the taps are 16-bit elements at fixed positions, so the parameters A1 to A4 are plain byte offsets.
1955 *
1956 * @param NAME Either 1 or 3
1957 * @see MSPEL_FILTER13_CORE for information on A1->A4
1958 */
1959 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1960 static void \
1961 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1962 const int16_t *src, int rnd) \
1963 { \
1964 int h = 8; \
1965 DECLARE_VAR_ALL64; \
1966 DECLARE_VAR_ADDRT; \
1967 \
1968 src -= 1; \
1969 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1970 \
1971 __asm__ volatile( \
1972 "pxor $f0, $f0, $f0 \n\t" \
1973 LOAD_ROUNDER_MMI("%[rnd]") \
1974 ".p2align 3 \n\t" \
1975 "1: \n\t" \
1976 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1977 "li $8, 0x07 \n\t" \
1978 "mtc1 $8, $f16 \n\t" \
1979 NORMALIZE_MMI("$f16") \
1980 /* Remove bias */ \
1981 "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1982 "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1983 TRANSFER_DO_PACK(OP) \
1984 "addiu %[h], %[h], -0x01 \n\t" \
1985 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1986 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1987 "bnez %[h], 1b \n\t" \
1988 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1989 [h]"+r"(h), \
1990 [src]"+r"(src), [dst]"+r"(dst) \
1991 : [stride]"r"(stride), [rnd]"m"(rnd), \
1992 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1993 [ff_pw_3]"f"(ff_pw_3.f), [ff_pw_128]"f"(ff_pw_128.f) \
1994 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1995 "$f14", "$f16", "memory" \
1996 ); \
1997 }
1998
1999 /**
2000  * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
2001  * Here, offset is a run-time parameter (1 or stride). Parameters passed
2002  * A1 to A4 must use %[offset_x1] (offset), %[offset_x2] (2*offset), %[offset_x3] (3*offset) or $0.
2003 *
2004 * @param NAME Either 1 or 3
2005 * @see MSPEL_FILTER13_CORE for information on A1->A4
2006 */
2007 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2008 static void \
2009 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2010 mips_reg stride, int rnd, mips_reg offset) \
2011 { \
2012 int h = 8; \
2013 DECLARE_VAR_LOW32; \
2014 DECLARE_VAR_ADDRT; \
2015 \
2016 src -= offset; \
2017 rnd = 32-rnd; \
2018 \
2019 __asm__ volatile ( \
2020 "pxor $f0, $f0, $f0 \n\t" \
2021 LOAD_ROUNDER_MMI("%[rnd]") \
2022 ".p2align 3 \n\t" \
2023 "1: \n\t" \
2024 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2025 "li $8, 0x06 \n\t" \
2026 "mtc1 $8, $f16 \n\t" \
2027 NORMALIZE_MMI("$f16") \
2028 TRANSFER_DO_PACK(OP) \
2029 "addiu %[h], %[h], -0x01 \n\t" \
2030 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2031 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2032 "bnez %[h], 1b \n\t" \
2033 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2034 [h]"+r"(h), \
2035 [src]"+r"(src), [dst]"+r"(dst) \
2036 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2037 [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2038 [rnd]"m"(rnd), \
2039 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
2040 [ff_pw_3]"f"(ff_pw_3.f) \
2041 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
2042 "$f14", "$f16", "memory" \
2043 ); \
2044 }
2045
2046
2047 /** 1/4 shift bicubic interpolation */
2048 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2049 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2050 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2051 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2052 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2053
2054 /** 3/4 shift bicubic interpolation */
2055 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2056 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2057 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2058 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2059 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
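/*
 * shift1 and shift3 share one kernel: reversing the tap order A1..A4
 * turns the 1/4-pel filter into its mirrored 3/4-pel counterpart.
 */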
2060
2061 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2062 (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2063 int64_t shift);
2064 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2065 (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2066 typedef void (*vc1_mspel_mc_filter_8bits)
2067 (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2068 mips_reg offset);
2069
2070 /**
2071 * Interpolate fractional pel values by applying proper vertical then
2072 * horizontal filter.
2073 *
2074 * @param dst Destination buffer for interpolated pels.
2075 * @param src Source buffer.
2076 * @param stride Stride for both src and dst buffers.
2077  * @param hmode Horizontal filter (expressed in quarter pixels shift).
2078  * @param vmode Vertical filter (expressed in quarter pixels shift).
2079 * @param rnd Rounding bias.
2080 */
2081 #define VC1_MSPEL_MC(OP) \
2082 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2083 int hmode, int vmode, int rnd) \
2084 { \
2085 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2086 { NULL, vc1_put_ver_16b_shift1_mmi, \
2087 vc1_put_ver_16b_shift2_mmi, \
2088 vc1_put_ver_16b_shift3_mmi }; \
2089 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2090 { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2091 OP ## vc1_hor_16b_shift2_mmi, \
2092 OP ## vc1_hor_16b_shift3_mmi }; \
2093 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2094 { NULL, OP ## vc1_shift1_mmi, \
2095 OP ## vc1_shift2_mmi, \
2096 OP ## vc1_shift3_mmi }; \
2097 \
2098 if (vmode) { /* Vertical filter to apply */ \
2099 if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2100 static const int shift_value[] = { 0, 5, 1, 5 }; \
2101 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2102 int r; \
2103 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2104 \
2105 r = (1<<(shift-1)) + rnd-1; \
2106 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2107 \
2108 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2109 return; \
2110 } \
2111 else { /* No horizontal filter, output 8 lines to dst */ \
2112 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2113 return; \
2114 } \
2115 } \
2116 \
2117 /* Horizontal mode with no vertical mode */ \
2118 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2119 } \
2120 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2121 int stride, int hmode, int vmode, int rnd)\
2122 { \
2123 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2124 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2125 dst += 8*stride; src += 8*stride; \
2126 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2127 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2128 }
2129
2130 VC1_MSPEL_MC(put_)
2131 VC1_MSPEL_MC(avg_)
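/*
 * Dispatch example: hmode = vmode = 2 (true half-pel in both directions)
 * runs vc1_put_ver_16b_shift2_mmi into the temporary buffer with
 * shift = (1+1)>>1 = 1, then {put,avg}_vc1_hor_16b_shift2_mmi filters
 * that buffer horizontally back into dst.
 */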
2132
2133 /** Macro to ease declaration of the bicubic interpolation wrapper functions */
2134 #define DECLARE_FUNCTION(a, b) \
2135 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2136 const uint8_t *src, \
2137 ptrdiff_t stride, \
2138 int rnd) \
2139 { \
2140 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2141 } \
2142 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2143 const uint8_t *src, \
2144 ptrdiff_t stride, \
2145 int rnd) \
2146 { \
2147 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2148 } \
2149 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2150 const uint8_t *src, \
2151 ptrdiff_t stride, \
2152 int rnd) \
2153 { \
2154 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2155 } \
2156 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2157 const uint8_t *src, \
2158 ptrdiff_t stride, \
2159 int rnd) \
2160 { \
2161 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2162 }
2163
2164 DECLARE_FUNCTION(0, 1)
2165 DECLARE_FUNCTION(0, 2)
2166 DECLARE_FUNCTION(0, 3)
2167
2168 DECLARE_FUNCTION(1, 0)
2169 DECLARE_FUNCTION(1, 1)
2170 DECLARE_FUNCTION(1, 2)
2171 DECLARE_FUNCTION(1, 3)
2172
2173 DECLARE_FUNCTION(2, 0)
2174 DECLARE_FUNCTION(2, 1)
2175 DECLARE_FUNCTION(2, 2)
2176 DECLARE_FUNCTION(2, 3)
2177
2178 DECLARE_FUNCTION(3, 0)
2179 DECLARE_FUNCTION(3, 1)
2180 DECLARE_FUNCTION(3, 2)
2181 DECLARE_FUNCTION(3, 3)
2182
2183 #define CHROMA_MC_8_MMI \
2184 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2185 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2186 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2187 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2188 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2189 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2190 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2191 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2192 \
2193 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2194 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2195 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2196 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2197 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2198 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2199 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2200 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2201 \
2202 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2203 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2204 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2205 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2206 \
2207 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2208 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2209 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2210 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2211 \
2212 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2213 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2214 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2215
2216
2217 #define CHROMA_MC_4_MMI \
2218 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2219 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2220 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2221 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2222 \
2223 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2224 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2225 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2226 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2227 \
2228 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2229 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2230 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2231 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2232 \
2233 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2234 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2235
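/*
 * Both kernels compute the VC-1 no-rounding chroma interpolation
 *     dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride]
 *               + D*src[x+stride+1] + 28) >> 6
 * with the bilinear weights A..D splatted to all lanes; the 8-pixel
 * variant simply processes the row as low and high halves.
 */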
2236
2237 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2238 uint8_t *src /* align 1 */,
2239 ptrdiff_t stride, int h, int x, int y)
2240 {
2241 union mmi_intfloat64 A, B, C, D;
2242 double ftmp[10];
2243 uint32_t tmp[1];
2244 DECLARE_VAR_ALL64;
2245 DECLARE_VAR_ADDRT;
2246 A.i = (8 - x) * (8 - y);
2247 B.i = (x) * (8 - y);
2248 C.i = (8 - x) * (y);
2249 D.i = (x) * (y);
2250
2251 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2252
2253 __asm__ volatile(
2254 "li %[tmp0], 0x06 \n\t"
2255 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2256 "mtc1 %[tmp0], %[ftmp9] \n\t"
2257 "pshufh %[A], %[A], %[ftmp0] \n\t"
2258 "pshufh %[B], %[B], %[ftmp0] \n\t"
2259 "pshufh %[C], %[C], %[ftmp0] \n\t"
2260 "pshufh %[D], %[D], %[ftmp0] \n\t"
2261
2262 "1: \n\t"
2263 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2264 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2265 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2266 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2267 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2268
2269 CHROMA_MC_8_MMI
2270
2271 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2272 "addiu %[h], %[h], -0x01 \n\t"
2273 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2274 "bnez %[h], 1b \n\t"
2275 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2276 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2277 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2278 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2279 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2280 RESTRICT_ASM_ALL64
2281 RESTRICT_ASM_ADDRT
2282 [tmp0]"=&r"(tmp[0]),
2283 [src]"+&r"(src), [dst]"+&r"(dst),
2284 [h]"+&r"(h)
2285 : [stride]"r"((mips_reg)stride),
2286 [A]"f"(A.f), [B]"f"(B.f),
2287 [C]"f"(C.f), [D]"f"(D.f),
2288 [ff_pw_28]"f"(ff_pw_28.f)
2289 : "memory"
2290 );
2291 }
2292
2293 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2294 uint8_t *src /* align 1 */,
2295 ptrdiff_t stride, int h, int x, int y)
2296 {
2297 union mmi_intfloat64 A, B, C, D;
2298 double ftmp[6];
2299 uint32_t tmp[1];
2300 DECLARE_VAR_LOW32;
2301 DECLARE_VAR_ADDRT;
2302 A.i = (8 - x) * (8 - y);
2303 B.i = (x) * (8 - y);
2304 C.i = (8 - x) * (y);
2305 D.i = (x) * (y);
2306
2307 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2308
2309 __asm__ volatile(
2310 "li %[tmp0], 0x06 \n\t"
2311 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2312 "mtc1 %[tmp0], %[ftmp5] \n\t"
2313 "pshufh %[A], %[A], %[ftmp0] \n\t"
2314 "pshufh %[B], %[B], %[ftmp0] \n\t"
2315 "pshufh %[C], %[C], %[ftmp0] \n\t"
2316 "pshufh %[D], %[D], %[ftmp0] \n\t"
2317
2318 "1: \n\t"
2319 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2320 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2321 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2322 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2323 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2324
2325 CHROMA_MC_4_MMI
2326
2327 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2328 "addiu %[h], %[h], -0x01 \n\t"
2329 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2330 "bnez %[h], 1b \n\t"
2331 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2332 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2333 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2334 [tmp0]"=&r"(tmp[0]),
2335 RESTRICT_ASM_LOW32
2336 RESTRICT_ASM_ADDRT
2337 [src]"+&r"(src), [dst]"+&r"(dst),
2338 [h]"+&r"(h)
2339 : [stride]"r"((mips_reg)stride),
2340 [A]"f"(A.f), [B]"f"(B.f),
2341 [C]"f"(C.f), [D]"f"(D.f),
2342 [ff_pw_28]"f"(ff_pw_28.f)
2343 : "memory"
2344 );
2345 }
2346
2347 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2348 uint8_t *src /* align 1 */,
2349 ptrdiff_t stride, int h, int x, int y)
2350 {
2351 union mmi_intfloat64 A, B, C, D;
2352 double ftmp[10];
2353 uint32_t tmp[1];
2354 DECLARE_VAR_ALL64;
2355 DECLARE_VAR_ADDRT;
2356 A.i = (8 - x) * (8 - y);
2357 B.i = (x) * (8 - y);
2358 C.i = (8 - x) * (y);
2359 D.i = (x) * (y);
2360
2361 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2362
2363 __asm__ volatile(
2364 "li %[tmp0], 0x06 \n\t"
2365 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2366 "mtc1 %[tmp0], %[ftmp9] \n\t"
2367 "pshufh %[A], %[A], %[ftmp0] \n\t"
2368 "pshufh %[B], %[B], %[ftmp0] \n\t"
2369 "pshufh %[C], %[C], %[ftmp0] \n\t"
2370 "pshufh %[D], %[D], %[ftmp0] \n\t"
2371
2372 "1: \n\t"
2373 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2374 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2375 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2376 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2377 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2378
2379 CHROMA_MC_8_MMI
2380
2381 MMI_LDC1(%[ftmp2], %[dst], 0x00)
2382 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2383
2384 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2385 "addiu %[h], %[h], -0x01 \n\t"
2386 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2387 "bnez %[h], 1b \n\t"
2388 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2389 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2390 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2391 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2392 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2393 [tmp0]"=&r"(tmp[0]),
2394 RESTRICT_ASM_ALL64
2395 RESTRICT_ASM_ADDRT
2396 [src]"+&r"(src), [dst]"+&r"(dst),
2397 [h]"+&r"(h)
2398 : [stride]"r"((mips_reg)stride),
2399 [A]"f"(A.f), [B]"f"(B.f),
2400 [C]"f"(C.f), [D]"f"(D.f),
2401 [ff_pw_28]"f"(ff_pw_28.f)
2402 : "memory"
2403 );
2404 }
2405
2406 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2407 uint8_t *src /* align 1 */,
2408 ptrdiff_t stride, int h, int x, int y)
2409 {
2410 union mmi_intfloat64 A, B, C, D;
2411 double ftmp[6];
2412 uint32_t tmp[1];
2413 DECLARE_VAR_LOW32;
2414 DECLARE_VAR_ADDRT;
2415 A.i = (8 - x) * (8 - y);
2416 B.i = (x) * (8 - y);
2417 C.i = (8 - x) * (y);
2418 D.i = (x) * (y);
2419
2420 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2421
2422 __asm__ volatile(
2423 "li %[tmp0], 0x06 \n\t"
2424 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2425 "mtc1 %[tmp0], %[ftmp5] \n\t"
2426 "pshufh %[A], %[A], %[ftmp0] \n\t"
2427 "pshufh %[B], %[B], %[ftmp0] \n\t"
2428 "pshufh %[C], %[C], %[ftmp0] \n\t"
2429 "pshufh %[D], %[D], %[ftmp0] \n\t"
2430
2431 "1: \n\t"
2432 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2433 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2434 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2435 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2436 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2437
2438 CHROMA_MC_4_MMI
2439
2440 MMI_LWC1(%[ftmp2], %[dst], 0x00)
2441 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2442
2443 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2444 "addiu %[h], %[h], -0x01 \n\t"
2445 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2446 "bnez %[h], 1b \n\t"
2447 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2448 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2449 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2450 [tmp0]"=&r"(tmp[0]),
2451 RESTRICT_ASM_LOW32
2452 RESTRICT_ASM_ADDRT
2453 [src]"+&r"(src), [dst]"+&r"(dst),
2454 [h]"+&r"(h)
2455 : [stride]"r"((mips_reg)stride),
2456 [A]"f"(A.f), [B]"f"(B.f),
2457 [C]"f"(C.f), [D]"f"(D.f),
2458 [ff_pw_28]"f"(ff_pw_28.f)
2459 : "memory"
2460 );
2461 }
2462