1 /*
2 * Loongson SIMD optimized h264pred
3 *
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #include "h264pred_mips.h"
26 #include "libavcodec/bit_depth_template.c"
27 #include "libavutil/mips/mmiutils.h"
28 #include "constants.h"
29
ff_pred16x16_vertical_8_mmi(uint8_t * src,ptrdiff_t stride)30 void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
31 {
32 double ftmp[2];
33 uint64_t tmp[1];
34 DECLARE_VAR_ALL64;
35
36 __asm__ volatile (
37 "dli %[tmp0], 0x08 \n\t"
38 MMI_LDC1(%[ftmp0], %[srcA], 0x00)
39 MMI_LDC1(%[ftmp1], %[srcA], 0x08)
40
41 "1: \n\t"
42 MMI_SDC1(%[ftmp0], %[src], 0x00)
43 MMI_SDC1(%[ftmp1], %[src], 0x08)
44 PTR_ADDU "%[src], %[src], %[stride] \n\t"
45 MMI_SDC1(%[ftmp0], %[src], 0x00)
46 MMI_SDC1(%[ftmp1], %[src], 0x08)
47
48 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
49 PTR_ADDU "%[src], %[src], %[stride] \n\t"
50 "bnez %[tmp0], 1b \n\t"
51 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
52 [tmp0]"=&r"(tmp[0]),
53 RESTRICT_ASM_ALL64
54 [src]"+&r"(src)
55 : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
56 : "memory"
57 );
58 }
59
ff_pred16x16_horizontal_8_mmi(uint8_t * src,ptrdiff_t stride)60 void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
61 {
62 uint64_t tmp[3];
63 mips_reg addr[2];
64
65 __asm__ volatile (
66 PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
67 PTR_ADDU "%[addr1], %[src], $0 \n\t"
68 "dli %[tmp2], 0x08 \n\t"
69 "1: \n\t"
70 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
71 "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
72 "swl %[tmp1], 0x07(%[addr1]) \n\t"
73 "swr %[tmp1], 0x00(%[addr1]) \n\t"
74 "swl %[tmp1], 0x0f(%[addr1]) \n\t"
75 "swr %[tmp1], 0x08(%[addr1]) \n\t"
76 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
77 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
78 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
79 "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
80 "swl %[tmp1], 0x07(%[addr1]) \n\t"
81 "swr %[tmp1], 0x00(%[addr1]) \n\t"
82 "swl %[tmp1], 0x0f(%[addr1]) \n\t"
83 "swr %[tmp1], 0x08(%[addr1]) \n\t"
84 "daddi %[tmp2], %[tmp2], -0x01 \n\t"
85 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
86 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
87 "bnez %[tmp2], 1b \n\t"
88 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
89 [tmp2]"=&r"(tmp[2]),
90 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
91 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
92 [ff_pb_1]"r"(ff_pb_1)
93 : "memory"
94 );
95 }
96
ff_pred16x16_dc_8_mmi(uint8_t * src,ptrdiff_t stride)97 void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
98 {
99 uint64_t tmp[4];
100 mips_reg addr[2];
101
102 __asm__ volatile (
103 PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
104 "dli %[tmp0], 0x08 \n\t"
105 "xor %[tmp3], %[tmp3], %[tmp3] \n\t"
106 "1: \n\t"
107 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
108 "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
109 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
110 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
111 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
112 "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
113 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
114 "bnez %[tmp0], 1b \n\t"
115
116 "dli %[tmp0], 0x08 \n\t"
117 PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
118 "2: \n\t"
119 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
120 "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
121 PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
122 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
123 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
124 "daddu %[tmp3], %[tmp3], %[tmp1] \n\t"
125 PTR_ADDIU "%[addr0], %[addr0], 0x01 \n\t"
126 "bnez %[tmp0], 2b \n\t"
127
128 "daddiu %[tmp3], %[tmp3], 0x10 \n\t"
129 "dsra %[tmp3], 0x05 \n\t"
130 "dmul %[tmp2], %[tmp3], %[ff_pb_1] \n\t"
131 PTR_ADDU "%[addr0], %[src], $0 \n\t"
132 "dli %[tmp0], 0x08 \n\t"
133 "3: \n\t"
134 "swl %[tmp2], 0x07(%[addr0]) \n\t"
135 "swr %[tmp2], 0x00(%[addr0]) \n\t"
136 "swl %[tmp2], 0x0f(%[addr0]) \n\t"
137 "swr %[tmp2], 0x08(%[addr0]) \n\t"
138 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
139 "swl %[tmp2], 0x07(%[addr0]) \n\t"
140 "swr %[tmp2], 0x00(%[addr0]) \n\t"
141 "swl %[tmp2], 0x0f(%[addr0]) \n\t"
142 "swr %[tmp2], 0x08(%[addr0]) \n\t"
143 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
144 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
145 "bnez %[tmp0], 3b \n\t"
146 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
147 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
148 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
149 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
150 [ff_pb_1]"r"(ff_pb_1)
151 : "memory"
152 );
153 }
154
ff_pred8x8l_top_dc_8_mmi(uint8_t * src,int has_topleft,int has_topright,ptrdiff_t stride)155 void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
156 int has_topright, ptrdiff_t stride)
157 {
158 double ftmp[11];
159 mips_reg tmp[3];
160 union av_intfloat64 dc;
161 DECLARE_VAR_ALL64;
162 DECLARE_VAR_ADDRT;
163
164 __asm__ volatile (
165 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
166 MMI_ULDC1(%[ftmp10], %[srcA], 0x00)
167 MMI_ULDC1(%[ftmp9], %[src0], 0x00)
168 MMI_ULDC1(%[ftmp8], %[src1], 0x00)
169
170 "punpcklbh %[ftmp7], %[ftmp10], %[ftmp0] \n\t"
171 "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t"
172 "punpcklbh %[ftmp5], %[ftmp9], %[ftmp0] \n\t"
173 "punpckhbh %[ftmp4], %[ftmp9], %[ftmp0] \n\t"
174 "punpcklbh %[ftmp3], %[ftmp8], %[ftmp0] \n\t"
175 "punpckhbh %[ftmp2], %[ftmp8], %[ftmp0] \n\t"
176 "bnez %[has_topleft], 1f \n\t"
177 "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
178
179 "1: \n\t"
180 "bnez %[has_topright], 2f \n\t"
181 "dli %[tmp0], 0xa4 \n\t"
182 "mtc1 %[tmp0], %[ftmp1] \n\t"
183 "pshufh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
184
185 "2: \n\t"
186 "dli %[tmp0], 0x02 \n\t"
187 "mtc1 %[tmp0], %[ftmp1] \n\t"
188 "pmullh %[ftmp5], %[ftmp5], %[ff_pw_2] \n\t"
189 "pmullh %[ftmp4], %[ftmp4], %[ff_pw_2] \n\t"
190 "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
191 "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
192 "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
193 "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
194 "paddh %[ftmp7], %[ftmp7], %[ff_pw_2] \n\t"
195 "paddh %[ftmp6], %[ftmp6], %[ff_pw_2] \n\t"
196 "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
197 "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
198 "packushb %[ftmp9], %[ftmp7], %[ftmp6] \n\t"
199 "biadd %[ftmp10], %[ftmp9] \n\t"
200 "mfc1 %[tmp1], %[ftmp10] \n\t"
201 "addiu %[tmp1], %[tmp1], 0x04 \n\t"
202 "srl %[tmp1], %[tmp1], 0x03 \n\t"
203 "mul %[dc], %[tmp1], %[ff_pb_1] \n\t"
204 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
205 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
206 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
207 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
208 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
209 [ftmp10]"=&f"(ftmp[10]),
210 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
211 RESTRICT_ASM_ALL64
212 [dc]"=r"(dc.i)
213 : [srcA]"r"((mips_reg)(src-stride-1)),
214 [src0]"r"((mips_reg)(src-stride)),
215 [src1]"r"((mips_reg)(src-stride+1)),
216 [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright),
217 [ff_pb_1]"r"(ff_pb_1.i), [ff_pw_2]"f"(ff_pw_2.f)
218 : "memory"
219 );
220
221 __asm__ volatile (
222 "dli %[tmp0], 0x02 \n\t"
223 "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
224
225 "1: \n\t"
226 MMI_SDC1(%[ftmp0], %[src], 0x00)
227 MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
228 PTR_ADDU "%[src], %[src], %[stride] \n\t"
229 PTR_ADDU "%[src], %[src], %[stride] \n\t"
230 MMI_SDC1(%[ftmp0], %[src], 0x00)
231 MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
232
233 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
234 PTR_ADDU "%[src], %[src], %[stride] \n\t"
235 PTR_ADDU "%[src], %[src], %[stride] \n\t"
236 "bnez %[tmp0], 1b \n\t"
237 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
238 RESTRICT_ASM_ALL64
239 RESTRICT_ASM_ADDRT
240 [src]"+&r"(src)
241 : [dc]"f"(dc.f), [stride]"r"((mips_reg)stride)
242 : "memory"
243 );
244 }
245
ff_pred8x8l_dc_8_mmi(uint8_t * src,int has_topleft,int has_topright,ptrdiff_t stride)246 void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
247 ptrdiff_t stride)
248 {
249 uint32_t dc1, dc2;
250 double ftmp[14];
251 mips_reg tmp[1];
252 union av_intfloat64 dc;
253
254 const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
255 const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
256 const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
257 const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
258 const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
259 const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
260 const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
261 const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
262
263 DECLARE_VAR_ALL64;
264 DECLARE_VAR_ADDRT;
265
266 __asm__ volatile (
267 MMI_ULDC1(%[ftmp4], %[srcA], 0x00)
268 MMI_ULDC1(%[ftmp5], %[src0], 0x00)
269 MMI_ULDC1(%[ftmp6], %[src1], 0x00)
270 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
271 "dli %[tmp0], 0x03 \n\t"
272 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
273 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
274 "mtc1 %[tmp0], %[ftmp1] \n\t"
275 "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t"
276 "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
277 "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t"
278 "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
279 "pshufh %[ftmp3], %[ftmp8], %[ftmp1] \n\t"
280 "pshufh %[ftmp13], %[ftmp12], %[ftmp1] \n\t"
281 "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
282 "pinsrh_3 %[ftmp12], %[ftmp12], %[ftmp3] \n\t"
283 "bnez %[has_topleft], 1f \n\t"
284 "pinsrh_0 %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
285
286 "1: \n\t"
287 "bnez %[has_topright], 2f \n\t"
288 "pshufh %[ftmp13], %[ftmp10], %[ftmp1] \n\t"
289 "pinsrh_3 %[ftmp8], %[ftmp8], %[ftmp13] \n\t"
290
291 "2: \n\t"
292 "dli %[tmp0], 0x02 \n\t"
293 "mtc1 %[tmp0], %[ftmp1] \n\t"
294 "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
295 "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
296 "pmullh %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
297 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
298 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
299 "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
300 "paddh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
301 "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
302 "paddh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
303 "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
304 "psrah %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
305 "packushb %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
306 "biadd %[ftmp4], %[ftmp5] \n\t"
307 "mfc1 %[dc2], %[ftmp4] \n\t"
308 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
309 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
310 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
311 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
312 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
313 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
314 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
315 [tmp0]"=&r"(tmp[0]),
316 RESTRICT_ASM_ALL64
317 [dc2]"=r"(dc2)
318 : [srcA]"r"((mips_reg)(src-stride-1)),
319 [src0]"r"((mips_reg)(src-stride)),
320 [src1]"r"((mips_reg)(src-stride+1)),
321 [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
322 : "memory"
323 );
324
325 dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
326 dc.i = ((dc1+dc2+8)>>4)*0x01010101U;
327
328 __asm__ volatile (
329 "dli %[tmp0], 0x02 \n\t"
330 "punpcklwd %[ftmp0], %[dc], %[dc] \n\t"
331
332 "1: \n\t"
333 MMI_SDC1(%[ftmp0], %[src], 0x00)
334 MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
335 PTR_ADDU "%[src], %[src], %[stride] \n\t"
336 PTR_ADDU "%[src], %[src], %[stride] \n\t"
337 MMI_SDC1(%[ftmp0], %[src], 0x00)
338 MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
339
340 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
341 PTR_ADDU "%[src], %[src], %[stride] \n\t"
342 PTR_ADDU "%[src], %[src], %[stride] \n\t"
343 "bnez %[tmp0], 1b \n\t"
344 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
345 RESTRICT_ASM_ALL64
346 RESTRICT_ASM_ADDRT
347 [src]"+&r"(src)
348 : [dc]"f"(dc.f), [stride]"r"((mips_reg)stride)
349 : "memory"
350 );
351 }
352
ff_pred8x8l_vertical_8_mmi(uint8_t * src,int has_topleft,int has_topright,ptrdiff_t stride)353 void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
354 int has_topright, ptrdiff_t stride)
355 {
356 double ftmp[12];
357 mips_reg tmp[1];
358 DECLARE_VAR_ALL64;
359
360 __asm__ volatile (
361 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
362 MMI_LDC1(%[ftmp3], %[srcA], 0x00)
363 MMI_LDC1(%[ftmp4], %[src0], 0x00)
364 MMI_LDC1(%[ftmp5], %[src1], 0x00)
365 "punpcklbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
366 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
367 "punpcklbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
368 "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
369 "punpcklbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t"
370 "punpckhbh %[ftmp11], %[ftmp5], %[ftmp0] \n\t"
371 "bnez %[has_topleft], 1f \n\t"
372 "pinsrh_0 %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
373
374 "1: \n\t"
375 "bnez %[has_topright], 2f \n\t"
376 "dli %[tmp0], 0xa4 \n\t"
377 "mtc1 %[tmp0], %[ftmp1] \n\t"
378 "pshufh %[ftmp11], %[ftmp11], %[ftmp1] \n\t"
379
380 "2: \n\t"
381 "dli %[tmp0], 0x02 \n\t"
382 "mtc1 %[tmp0], %[ftmp1] \n\t"
383 "pshufh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
384 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
385 "pmullh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
386 "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
387 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
388 "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
389 "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
390 "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
391 "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
392 "psrah %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
393 "psrah %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
394 "packushb %[ftmp4], %[ftmp6], %[ftmp7] \n\t"
395 MMI_SDC1(%[ftmp4], %[src], 0x00)
396 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
397 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
398 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
399 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
400 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
401 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
402 [tmp0]"=&r"(tmp[0]),
403 RESTRICT_ASM_ALL64
404 [src]"=r"(src)
405 : [srcA]"r"((mips_reg)(src-stride-1)),
406 [src0]"r"((mips_reg)(src-stride)),
407 [src1]"r"((mips_reg)(src-stride+1)),
408 [has_topleft]"r"(has_topleft), [has_topright]"r"(has_topright)
409 : "memory"
410 );
411
412 __asm__ volatile (
413 "dli %[tmp0], 0x02 \n\t"
414
415 "1: \n\t"
416 MMI_SDC1(%[ftmp0], %[src], 0x00)
417 PTR_ADDU "%[src], %[src], %[stride] \n\t"
418 MMI_SDC1(%[ftmp0], %[src], 0x00)
419 PTR_ADDU "%[src], %[src], %[stride] \n\t"
420 MMI_SDC1(%[ftmp0], %[src], 0x00)
421 PTR_ADDU "%[src], %[src], %[stride] \n\t"
422 MMI_SDC1(%[ftmp0], %[src], 0x00)
423
424 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
425 PTR_ADDU "%[src], %[src], %[stride] \n\t"
426 "bnez %[tmp0], 1b \n\t"
427 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
428 RESTRICT_ASM_ALL64
429 [src]"+&r"(src)
430 : [stride]"r"((mips_reg)stride)
431 : "memory"
432 );
433 }
434
ff_pred4x4_dc_8_mmi(uint8_t * src,const uint8_t * topright,ptrdiff_t stride)435 void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
436 ptrdiff_t stride)
437 {
438 const int dc = (src[-stride] + src[1-stride] + src[2-stride]
439 + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
440 + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
441 uint64_t tmp[2];
442 mips_reg addr[1];
443 DECLARE_VAR_ADDRT;
444
445 __asm__ volatile (
446 PTR_ADDU "%[tmp0], %[dc], $0 \n\t"
447 "dmul %[tmp1], %[tmp0], %[ff_pb_1] \n\t"
448 "xor %[addr0], %[addr0], %[addr0] \n\t"
449 MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
450 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
451 MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
452 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
453 MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
454 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
455 MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
456 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
457 RESTRICT_ASM_ADDRT
458 [addr0]"=&r"(addr[0])
459 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
460 [dc]"r"(dc), [ff_pb_1]"r"(ff_pb_1)
461 : "memory"
462 );
463 }
464
ff_pred8x8_vertical_8_mmi(uint8_t * src,ptrdiff_t stride)465 void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
466 {
467 uint64_t tmp[2];
468 mips_reg addr[2];
469
470 __asm__ volatile (
471 PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
472 PTR_ADDU "%[addr1], %[src], $0 \n\t"
473 "ldl %[tmp0], 0x07(%[addr0]) \n\t"
474 "ldr %[tmp0], 0x00(%[addr0]) \n\t"
475 "dli %[tmp1], 0x04 \n\t"
476 "1: \n\t"
477 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
478 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
479 PTR_ADDU "%[addr1], %[stride] \n\t"
480 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
481 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
482 "daddi %[tmp1], -0x01 \n\t"
483 PTR_ADDU "%[addr1], %[stride] \n\t"
484 "bnez %[tmp1], 1b \n\t"
485 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
486 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
487 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
488 : "memory"
489 );
490 }
491
ff_pred8x8_horizontal_8_mmi(uint8_t * src,ptrdiff_t stride)492 void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
493 {
494 uint64_t tmp[3];
495 mips_reg addr[2];
496
497 __asm__ volatile (
498 PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
499 PTR_ADDU "%[addr1], %[src], $0 \n\t"
500 "dli %[tmp0], 0x04 \n\t"
501 "1: \n\t"
502 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
503 "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
504 "swl %[tmp2], 0x07(%[addr1]) \n\t"
505 "swr %[tmp2], 0x00(%[addr1]) \n\t"
506 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
507 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
508 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
509 "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
510 "swl %[tmp2], 0x07(%[addr1]) \n\t"
511 "swr %[tmp2], 0x00(%[addr1]) \n\t"
512 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
513 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
514 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
515 "bnez %[tmp0], 1b \n\t"
516 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
517 [tmp2]"=&r"(tmp[2]),
518 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
519 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
520 [ff_pb_1]"r"(ff_pb_1)
521 : "memory"
522 );
523 }
524
ff_pred8x8_top_dc_8_mmi(uint8_t * src,ptrdiff_t stride)525 void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
526 {
527 double ftmp[4];
528 uint64_t tmp[1];
529 mips_reg addr[1];
530 DECLARE_VAR_ALL64;
531
532 __asm__ volatile (
533 "dli %[tmp0], 0x02 \n\t"
534 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
535 PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
536 MMI_LDC1(%[ftmp1], %[addr0], 0x00)
537 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
538 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
539 "biadd %[ftmp2], %[ftmp2] \n\t"
540 "biadd %[ftmp3], %[ftmp3] \n\t"
541 "mtc1 %[tmp0], %[ftmp1] \n\t"
542 "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
543 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
544 "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
545 "paddush %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
546 "paddush %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
547 "mtc1 %[tmp0], %[ftmp1] \n\t"
548 "psrlh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
549 "psrlh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
550 "packushb %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
551 MMI_SDC1(%[ftmp1], %[src], 0x00)
552 PTR_ADDU "%[src], %[src], %[stride] \n\t"
553 MMI_SDC1(%[ftmp1], %[src], 0x00)
554 PTR_ADDU "%[src], %[src], %[stride] \n\t"
555 MMI_SDC1(%[ftmp1], %[src], 0x00)
556 PTR_ADDU "%[src], %[src], %[stride] \n\t"
557 MMI_SDC1(%[ftmp1], %[src], 0x00)
558 PTR_ADDU "%[src], %[src], %[stride] \n\t"
559 MMI_SDC1(%[ftmp1], %[src], 0x00)
560 PTR_ADDU "%[src], %[src], %[stride] \n\t"
561 MMI_SDC1(%[ftmp1], %[src], 0x00)
562 PTR_ADDU "%[src], %[src], %[stride] \n\t"
563 MMI_SDC1(%[ftmp1], %[src], 0x00)
564 PTR_ADDU "%[src], %[src], %[stride] \n\t"
565 MMI_SDC1(%[ftmp1], %[src], 0x00)
566 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
567 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
568 [tmp0]"=&r"(tmp[0]),
569 RESTRICT_ASM_ALL64
570 [addr0]"=&r"(addr[0]),
571 [src]"+&r"(src)
572 : [stride]"r"((mips_reg)stride)
573 : "memory"
574 );
575 }
576
ff_pred8x8_dc_8_mmi(uint8_t * src,ptrdiff_t stride)577 void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
578 {
579 double ftmp[5];
580 mips_reg addr[7];
581
582 __asm__ volatile (
583 "negu %[addr0], %[stride] \n\t"
584 PTR_ADDU "%[addr0], %[addr0], %[src] \n\t"
585 PTR_ADDIU "%[addr1], %[addr0], 0x04 \n\t"
586 "lbu %[addr2], 0x00(%[addr0]) \n\t"
587 PTR_ADDU "%[addr3], $0, %[addr2] \n\t"
588 PTR_ADDIU "%[addr0], 0x01 \n\t"
589 "lbu %[addr2], 0x00(%[addr1]) \n\t"
590 PTR_ADDU "%[addr4], $0, %[addr2] \n\t"
591 PTR_ADDIU "%[addr1], 0x01 \n\t"
592 "lbu %[addr2], 0x00(%[addr0]) \n\t"
593 PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
594 PTR_ADDIU "%[addr0], 0x01 \n\t"
595 "lbu %[addr2], 0x00(%[addr1]) \n\t"
596 PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
597 PTR_ADDIU "%[addr1], 0x01 \n\t"
598 "lbu %[addr2], 0x00(%[addr0]) \n\t"
599 PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
600 PTR_ADDIU "%[addr0], 0x01 \n\t"
601 "lbu %[addr2], 0x00(%[addr1]) \n\t"
602 PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
603 PTR_ADDIU "%[addr1], 0x01 \n\t"
604 "lbu %[addr2], 0x00(%[addr0]) \n\t"
605 PTR_ADDU "%[addr3], %[addr3], %[addr2] \n\t"
606 PTR_ADDIU "%[addr0], 0x01 \n\t"
607 "lbu %[addr2], 0x00(%[addr1]) \n\t"
608 PTR_ADDU "%[addr4], %[addr4], %[addr2] \n\t"
609 PTR_ADDIU "%[addr1], 0x01 \n\t"
610 "dli %[addr2], -0x01 \n\t"
611 PTR_ADDU "%[addr2], %[addr2], %[src] \n\t"
612 "lbu %[addr1], 0x00(%[addr2]) \n\t"
613 PTR_ADDU "%[addr5], $0, %[addr1] \n\t"
614 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
615 "lbu %[addr1], 0x00(%[addr2]) \n\t"
616 PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
617 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
618 "lbu %[addr1], 0x00(%[addr2]) \n\t"
619 PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
620 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
621 "lbu %[addr1], 0x00(%[addr2]) \n\t"
622 PTR_ADDU "%[addr5], %[addr5], %[addr1] \n\t"
623 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
624 "lbu %[addr1], 0x00(%[addr2]) \n\t"
625 PTR_ADDU "%[addr6], $0, %[addr1] \n\t"
626 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
627 "lbu %[addr1], 0x00(%[addr2]) \n\t"
628 PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
629 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
630 "lbu %[addr1], 0x00(%[addr2]) \n\t"
631 PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
632 PTR_ADDU "%[addr2], %[addr2], %[stride] \n\t"
633 "lbu %[addr1], 0x00(%[addr2]) \n\t"
634 PTR_ADDU "%[addr6], %[addr6], %[addr1] \n\t"
635 PTR_ADDU "%[addr3], %[addr3], %[addr5] \n\t"
636 PTR_ADDIU "%[addr3], %[addr3], 0x04 \n\t"
637 PTR_ADDIU "%[addr4], %[addr4], 0x02 \n\t"
638 PTR_ADDIU "%[addr1], %[addr6], 0x02 \n\t"
639 PTR_ADDU "%[addr2], %[addr4], %[addr1] \n\t"
640 PTR_SRL "%[addr3], 0x03 \n\t"
641 PTR_SRL "%[addr4], 0x02 \n\t"
642 PTR_SRL "%[addr1], 0x02 \n\t"
643 PTR_SRL "%[addr2], 0x03 \n\t"
644 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
645 "dmtc1 %[addr3], %[ftmp1] \n\t"
646 "pshufh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
647 "dmtc1 %[addr4], %[ftmp2] \n\t"
648 "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
649 "dmtc1 %[addr1], %[ftmp3] \n\t"
650 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
651 "dmtc1 %[addr2], %[ftmp4] \n\t"
652 "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
653 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
654 "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
655 PTR_ADDU "%[addr0], $0, %[src] \n\t"
656 MMI_SDC1(%[ftmp1], %[addr0], 0x00)
657 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
658 MMI_SDC1(%[ftmp1], %[addr0], 0x00)
659 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
660 MMI_SDC1(%[ftmp1], %[addr0], 0x00)
661 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
662 MMI_SDC1(%[ftmp1], %[addr0], 0x00)
663 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
664 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
665 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
666 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
667 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
668 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
669 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
670 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
671 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
672 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
673 [ftmp4]"=&f"(ftmp[4]),
674 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
675 [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
676 [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
677 [addr6]"=&r"(addr[6])
678 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride)
679 : "memory"
680 );
681 }
682
ff_pred8x16_vertical_8_mmi(uint8_t * src,ptrdiff_t stride)683 void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
684 {
685 double ftmp[1];
686 uint64_t tmp[1];
687 DECLARE_VAR_ALL64;
688
689 __asm__ volatile (
690 MMI_LDC1(%[ftmp0], %[srcA], 0x00)
691 "dli %[tmp0], 0x04 \n\t"
692
693 "1: \n\t"
694 MMI_SDC1(%[ftmp0], %[src], 0x00)
695 PTR_ADDU "%[src], %[src], %[stride] \n\t"
696 MMI_SDC1(%[ftmp0], %[src], 0x00)
697 PTR_ADDU "%[src], %[src], %[stride] \n\t"
698 MMI_SDC1(%[ftmp0], %[src], 0x00)
699 PTR_ADDU "%[src], %[src], %[stride] \n\t"
700 MMI_SDC1(%[ftmp0], %[src], 0x00)
701
702 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
703 PTR_ADDU "%[src], %[src], %[stride] \n\t"
704 "bnez %[tmp0], 1b \n\t"
705 : [ftmp0]"=&f"(ftmp[0]),
706 [tmp0]"=&r"(tmp[0]),
707 RESTRICT_ASM_ALL64
708 [src]"+&r"(src)
709 : [stride]"r"((mips_reg)stride), [srcA]"r"((mips_reg)(src-stride))
710 : "memory"
711 );
712 }
713
ff_pred8x16_horizontal_8_mmi(uint8_t * src,ptrdiff_t stride)714 void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
715 {
716 uint64_t tmp[3];
717 mips_reg addr[2];
718
719 __asm__ volatile (
720 PTR_ADDI "%[addr0], %[src], -0x01 \n\t"
721 PTR_ADDU "%[addr1], %[src], $0 \n\t"
722 "dli %[tmp0], 0x08 \n\t"
723 "1: \n\t"
724 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
725 "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
726 "swl %[tmp2], 0x07(%[addr1]) \n\t"
727 "swr %[tmp2], 0x00(%[addr1]) \n\t"
728 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
729 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
730 "lbu %[tmp1], 0x00(%[addr0]) \n\t"
731 "dmul %[tmp2], %[tmp1], %[ff_pb_1] \n\t"
732 "swl %[tmp2], 0x07(%[addr1]) \n\t"
733 "swr %[tmp2], 0x00(%[addr1]) \n\t"
734 "daddi %[tmp0], %[tmp0], -0x01 \n\t"
735 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
736 PTR_ADDU "%[addr1], %[addr1], %[stride] \n\t"
737 "bnez %[tmp0], 1b \n\t"
738 : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
739 [tmp2]"=&r"(tmp[2]),
740 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1])
741 : [src]"r"((mips_reg)src), [stride]"r"((mips_reg)stride),
742 [ff_pb_1]"r"(ff_pb_1)
743 : "memory"
744 );
745 }
746
pred16x16_plane_compat_mmi(uint8_t * src,int stride,const int svq3,const int rv40)747 static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
748 const int svq3, const int rv40)
749 {
750 double ftmp[11];
751 uint64_t tmp[6];
752 mips_reg addr[1];
753 DECLARE_VAR_ALL64;
754
755 __asm__ volatile(
756 PTR_SUBU "%[addr0], %[src], %[stride] \n\t"
757 "dli %[tmp0], 0x20 \n\t"
758 "dmtc1 %[tmp0], %[ftmp4] \n\t"
759 MMI_ULDC1(%[ftmp0], %[addr0], -0x01)
760 MMI_ULDC1(%[ftmp2], %[addr0], 0x08)
761 "ssrld %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
762 "ssrld %[ftmp3], %[ftmp2], %[ftmp4] \n\t"
763 "pxor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
764 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
765 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
766 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
767 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
768 "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
769 "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
770 "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
771 "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
772 "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
773 "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
774 "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
775 "dli %[tmp0], 0x0e \n\t"
776 "dmtc1 %[tmp0], %[ftmp4] \n\t"
777 "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
778 "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
779 "dli %[tmp0], 0x01 \n\t"
780 "dmtc1 %[tmp0], %[ftmp4] \n\t"
781 "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
782 "paddsh %[ftmp5], %[ftmp0], %[ftmp1] \n\t"
783
784 PTR_ADDIU "%[addr0], %[src], -0x01 \n\t"
785 PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
786 "lbu %[tmp2], 0x00(%[addr0]) \n\t"
787 "lbu %[tmp5], 0x10(%[addr0]) \n\t"
788 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
789 "lbu %[tmp3], 0x00(%[addr0]) \n\t"
790 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
791 "lbu %[tmp4], 0x00(%[addr0]) \n\t"
792 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
793 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
794 "dsll %[tmp3], %[tmp3], 0x10 \n\t"
795 "dsll %[tmp4], %[tmp4], 0x20 \n\t"
796 "dsll %[tmp0], %[tmp0], 0x30 \n\t"
797 "or %[tmp4], %[tmp4], %[tmp0] \n\t"
798 "or %[tmp2], %[tmp2], %[tmp3] \n\t"
799 "or %[tmp2], %[tmp2], %[tmp4] \n\t"
800 "dmtc1 %[tmp2], %[ftmp0] \n\t"
801
802 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
803 "lbu %[tmp2], 0x00(%[addr0]) \n\t"
804 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
805 "lbu %[tmp3], 0x00(%[addr0]) \n\t"
806 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
807 "lbu %[tmp4], 0x00(%[addr0]) \n\t"
808 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
809 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
810 "dsll %[tmp3], %[tmp3], 0x10 \n\t"
811 "dsll %[tmp4], %[tmp4], 0x20 \n\t"
812 "dsll %[tmp0], %[tmp0], 0x30 \n\t"
813 "or %[tmp4], %[tmp4], %[tmp0] \n\t"
814 "or %[tmp2], %[tmp2], %[tmp3] \n\t"
815 "or %[tmp2], %[tmp2], %[tmp4] \n\t"
816 "dmtc1 %[tmp2], %[ftmp1] \n\t"
817
818 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
819 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
820 "lbu %[tmp2], 0x00(%[addr0]) \n\t"
821 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
822 "lbu %[tmp3], 0x00(%[addr0]) \n\t"
823 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
824 "lbu %[tmp4], 0x00(%[addr0]) \n\t"
825 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
826 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
827 "dsll %[tmp3], %[tmp3], 0x10 \n\t"
828 "dsll %[tmp4], %[tmp4], 0x20 \n\t"
829 "dsll %[tmp0], %[tmp0], 0x30 \n\t"
830 "or %[tmp4], %[tmp4], %[tmp0] \n\t"
831 "or %[tmp2], %[tmp2], %[tmp3] \n\t"
832 "or %[tmp2], %[tmp2], %[tmp4] \n\t"
833 "dmtc1 %[tmp2], %[ftmp2] \n\t"
834
835 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
836 "lbu %[tmp2], 0x00(%[addr0]) \n\t"
837 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
838 "lbu %[tmp3], 0x00(%[addr0]) \n\t"
839 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
840 "lbu %[tmp4], 0x00(%[addr0]) \n\t"
841 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
842 "lbu %[tmp0], 0x00(%[addr0]) \n\t"
843 "daddu %[tmp5], %[tmp5], %[tmp0] \n\t"
844 "daddiu %[tmp5], %[tmp5], 0x01 \n\t"
845 "dsll %[tmp5], %[tmp5], 0x04 \n\t"
846
847 "dsll %[tmp3], %[tmp3], 0x10 \n\t"
848 "dsll %[tmp4], %[tmp4], 0x20 \n\t"
849 "dsll %[tmp0], %[tmp0], 0x30 \n\t"
850 "or %[tmp4], %[tmp4], %[tmp0] \n\t"
851 "or %[tmp2], %[tmp2], %[tmp3] \n\t"
852 "or %[tmp2], %[tmp2], %[tmp4] \n\t"
853 "dmtc1 %[tmp2], %[ftmp3] \n\t"
854
855 "pmullh %[ftmp0], %[ftmp0], %[ff_pw_m8tom5] \n\t"
856 "pmullh %[ftmp1], %[ftmp1], %[ff_pw_m4tom1] \n\t"
857 "pmullh %[ftmp2], %[ftmp2], %[ff_pw_1to4] \n\t"
858 "pmullh %[ftmp3], %[ftmp3], %[ff_pw_5to8] \n\t"
859 "paddsh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
860 "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
861 "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
862 "dli %[tmp0], 0x0e \n\t"
863 "dmtc1 %[tmp0], %[ftmp4] \n\t"
864 "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
865 "paddsh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
866
867 "dli %[tmp0], 0x01 \n\t"
868 "dmtc1 %[tmp0], %[ftmp4] \n\t"
869 "pshufh %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
870 "paddsh %[ftmp6], %[ftmp0], %[ftmp1] \n\t"
871
872 "dmfc1 %[tmp0], %[ftmp5] \n\t"
873 "dsll %[tmp0], %[tmp0], 0x30 \n\t"
874 "dsra %[tmp0], %[tmp0], 0x30 \n\t"
875 "dmfc1 %[tmp1], %[ftmp6] \n\t"
876 "dsll %[tmp1], %[tmp1], 0x30 \n\t"
877 "dsra %[tmp1], %[tmp1], 0x30 \n\t"
878
879 "beqz %[svq3], 1f \n\t"
880 "dli %[tmp2], 0x04 \n\t"
881 "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
882 "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
883 "dli %[tmp2], 0x05 \n\t"
884 "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
885 "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
886 "dli %[tmp2], 0x10 \n\t"
887 "ddiv %[tmp0], %[tmp0], %[tmp2] \n\t"
888 "ddiv %[tmp1], %[tmp1], %[tmp2] \n\t"
889 "daddu %[tmp2], %[tmp0], $0 \n\t"
890 "daddu %[tmp0], %[tmp1], $0 \n\t"
891 "daddu %[tmp1], %[tmp2], $0 \n\t"
892 "b 2f \n\t"
893
894 "1: \n\t"
895 "beqz %[rv40], 1f \n\t"
896 "dsra %[tmp2], %[tmp0], 0x02 \n\t"
897 "daddu %[tmp0], %[tmp0], %[tmp2] \n\t"
898 "dsra %[tmp2], %[tmp1], 0x02 \n\t"
899 "daddu %[tmp1], %[tmp1], %[tmp2] \n\t"
900 "dsra %[tmp0], %[tmp0], 0x04 \n\t"
901 "dsra %[tmp1], %[tmp1], 0x04 \n\t"
902 "b 2f \n\t"
903
904 "1: \n\t"
905 "dli %[tmp2], 0x05 \n\t"
906 "dmul %[tmp0], %[tmp0], %[tmp2] \n\t"
907 "dmul %[tmp1], %[tmp1], %[tmp2] \n\t"
908 "daddiu %[tmp0], %[tmp0], 0x20 \n\t"
909 "daddiu %[tmp1], %[tmp1], 0x20 \n\t"
910 "dsra %[tmp0], %[tmp0], 0x06 \n\t"
911 "dsra %[tmp1], %[tmp1], 0x06 \n\t"
912
913 "2: \n\t"
914 "daddu %[tmp3], %[tmp0], %[tmp1] \n\t"
915 "dli %[tmp2], 0x07 \n\t"
916 "dmul %[tmp3], %[tmp3], %[tmp2] \n\t"
917 "dsubu %[tmp5], %[tmp5], %[tmp3] \n\t"
918
919 "pxor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
920 "dmtc1 %[tmp0], %[ftmp0] \n\t"
921 "pshufh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
922 "dmtc1 %[tmp1], %[ftmp5] \n\t"
923 "pshufh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
924 "dmtc1 %[tmp5], %[ftmp6] \n\t"
925 "pshufh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
926 "dli %[tmp0], 0x05 \n\t"
927 "dmtc1 %[tmp0], %[ftmp7] \n\t"
928 "pmullh %[ftmp1], %[ff_pw_0to3], %[ftmp0] \n\t"
929 "dmtc1 %[ff_pw_4to7], %[ftmp2] \n\t"
930 "pmullh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
931 "dmtc1 %[ff_pw_8tob], %[ftmp3] \n\t"
932 "pmullh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
933 "dmtc1 %[ff_pw_ctof], %[ftmp4] \n\t"
934 "pmullh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
935
936 "dli %[tmp0], 0x10 \n\t"
937 PTR_ADDU "%[addr0], %[src], $0 \n\t"
938 "1: \n\t"
939 "paddsh %[ftmp8], %[ftmp1], %[ftmp6] \n\t"
940 "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
941 "paddsh %[ftmp9], %[ftmp2], %[ftmp6] \n\t"
942 "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
943 "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
944 MMI_SDC1(%[ftmp0], %[addr0], 0x00)
945
946 "paddsh %[ftmp8], %[ftmp3], %[ftmp6] \n\t"
947 "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
948 "paddsh %[ftmp9], %[ftmp4], %[ftmp6] \n\t"
949 "psrah %[ftmp9], %[ftmp9], %[ftmp7] \n\t"
950 "packushb %[ftmp0], %[ftmp8], %[ftmp9] \n\t"
951 MMI_SDC1(%[ftmp0], %[addr0], 0x08)
952
953 "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
954 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
955 "daddiu %[tmp0], %[tmp0], -0x01 \n\t"
956 "bnez %[tmp0], 1b \n\t"
957 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
958 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
959 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
960 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
961 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
962 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
963 [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
964 [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
965 RESTRICT_ASM_ALL64
966 [addr0]"=&r"(addr[0])
967 : [src]"r"(src), [stride]"r"((mips_reg)stride),
968 [svq3]"r"(svq3), [rv40]"r"(rv40),
969 [ff_pw_m8tom5]"f"(ff_pw_m8tom5.f),[ff_pw_m4tom1]"f"(ff_pw_m4tom1.f),
970 [ff_pw_1to4]"f"(ff_pw_1to4.f), [ff_pw_5to8]"f"(ff_pw_5to8.f),
971 [ff_pw_0to3]"f"(ff_pw_0to3.f), [ff_pw_4to7]"r"(ff_pw_4to7.i),
972 [ff_pw_8tob]"r"(ff_pw_8tob.i), [ff_pw_ctof]"r"(ff_pw_ctof.i)
973 : "memory"
974 );
975 }
976
/*
 * 16x16 plane prediction, H.264 variant (8-bit, Loongson MMI).
 *
 * Delegates to pred16x16_plane_compat_mmi() with both variant flags
 * cleared, so the helper takes neither its svq3 nor its rv40 branch.
 *
 * src    - pointer forwarded unchanged to the helper
 * stride - row step forwarded unchanged to the helper
 */
void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
{
    const int svq3 = 0;
    const int rv40 = 0;

    pred16x16_plane_compat_mmi(src, stride, svq3, rv40);
}
981
/*
 * 16x16 plane prediction, SVQ3 variant (8-bit, Loongson MMI).
 *
 * Delegates to pred16x16_plane_compat_mmi() with the svq3 flag set and
 * the rv40 flag cleared.
 *
 * src    - pointer forwarded unchanged to the helper
 * stride - row step forwarded unchanged to the helper
 */
void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
{
    const int svq3 = 1;
    const int rv40 = 0;

    pred16x16_plane_compat_mmi(src, stride, svq3, rv40);
}
986
/*
 * 16x16 plane prediction, RV40 variant (8-bit, Loongson MMI).
 *
 * Delegates to pred16x16_plane_compat_mmi() with the svq3 flag cleared
 * and the rv40 flag set.
 *
 * src    - pointer forwarded unchanged to the helper
 * stride - row step forwarded unchanged to the helper
 */
void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
{
    const int svq3 = 0;
    const int rv40 = 1;

    pred16x16_plane_compat_mmi(src, stride, svq3, rv40);
}
991