1 /*
2 * Loongson SIMD optimized vp8dsp
3 *
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/mips/mmiutils.h"
27 #include "libavutil/mem_internal.h"
28
29 #define DECLARE_DOUBLE_1 double db_1
30 #define DECLARE_DOUBLE_2 double db_2
31 #define DECLARE_UINT32_T uint32_t it_1
32 #define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
33 #define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
34 #define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
35
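/* Emulate an unsigned per-byte "greater than" compare, which MMI lacks:
 * db_1 = (src1 == src2), db_2 = (pmaxub(src1, src2) == src1), i.e. src1 >= src2,
 * and the final xor clears the equal bytes, leaving dst = (src1 > src2) as
 * 0xFF/0x00 bytes.  db_1/db_2 are the scratch FPRs declared above. */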
36 #define MMI_PCMPGTUB(dst, src1, src2) \
37 "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
38 "pmaxub %[db_2], "#src1", "#src2" \n\t" \
39 "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
40 "xor "#dst", %[db_2], %[db_1] \n\t"
41
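/* Sign-extend the eight signed bytes of src into halfwords: pcmpgtb against
 * zero yields 0xFF for negative bytes, which punpcklbh/punpckhbh interleave
 * in as the high byte, giving dst_r = the low four halfwords and dst_l = the
 * high four. */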
42 #define MMI_BTOH(dst_l, dst_r, src) \
43 "xor %[db_1], %[db_1], %[db_1] \n\t" \
44 "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
45 "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
46 "punpckhbh "#dst_l", "#src", %[db_2] \n\t"
47
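/* Vector version of the normal (macroblock-edge) loop filter, eight pixels at
 * a time.  It first builds the hev and mask byte masks from thresh/E/I (the
 * counterparts of hev() and vp8_normal_limit() below), converts the pixels to
 * signed with the 0x80 xor, applies the (a+4)>>3 / (a+3)>>3 adjustment to
 * q0/p0 for hev pixels, and then the filter_mbedge() part on the rest:
 * 0x001b001b, 0x00120012 and the shift-by-3 path are the 27/18/9 weights,
 * 0x003f003f (63) the rounding term, with a final arithmetic shift by 7. */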
48 #define MMI_VP8_LOOP_FILTER \
49 /* Calculation of hev */ \
50 "dmtc1 %[thresh], %[ftmp3] \n\t" \
51 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
52 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
53 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
54 "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
55 "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
56 "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
57 MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
58 /* Calculation of mask */ \
59 "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
60 "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
61 "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
62 "li %[tmp0], 0x09 \n\t" \
63 "dmtc1 %[tmp0], %[ftmp3] \n\t" \
64 PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
65 "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
66 "dmtc1 %[e], %[ftmp3] \n\t" \
67 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
68 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
69 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
70 MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
71 "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
72 "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
73 "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
74 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
75 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
76 "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
77 "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
78 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
79 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
80 "dmtc1 %[i], %[ftmp3] \n\t" \
81 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
82 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
83 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
84 MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
85 "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
86 "xor %[mask], %[mask], %[ftmp3] \n\t" \
87 /* VP8_MBFILTER */ \
88 "li %[tmp0], 0x80808080 \n\t" \
89 "dmtc1 %[tmp0], %[ftmp7] \n\t" \
90 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
91 "xor %[p2], %[p2], %[ftmp7] \n\t" \
92 "xor %[p1], %[p1], %[ftmp7] \n\t" \
93 "xor %[p0], %[p0], %[ftmp7] \n\t" \
94 "xor %[q0], %[q0], %[ftmp7] \n\t" \
95 "xor %[q1], %[q1], %[ftmp7] \n\t" \
96 "xor %[q2], %[q2], %[ftmp7] \n\t" \
97 "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
98 "psubb %[ftmp5], %[q0], %[p0] \n\t" \
99 MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
100 MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
101 /* Right part */ \
102 "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
103 "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
104 "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
105 /* Left part */ \
106 "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
107 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
108 "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
109 /* Combine left and right part */ \
110 "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
111 "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
112 "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
113 "li %[tmp0], 0x04040404 \n\t" \
114 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
115 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
116 "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
117 "li %[tmp0], 0x0B \n\t" \
118 "dmtc1 %[tmp0], %[ftmp4] \n\t" \
119 PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
120 "li %[tmp0], 0x03030303 \n\t" \
121 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
122 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
123 "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
124 "li %[tmp0], 0x0B \n\t" \
125 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
126 PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
127 "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
128 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
129 /* filt_val &= ~hev */ \
130 "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
131 "xor %[hev], %[hev], %[ftmp0] \n\t" \
132 "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
133 MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
134 "li %[tmp0], 0x07 \n\t" \
135 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
136 "li %[tmp0], 0x001b001b \n\t" \
137 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
138 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
139 "li %[tmp0], 0x003f003f \n\t" \
140 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
141 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
142 /* Right part */ \
143 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
144 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
145 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
146 /* Left part */ \
147 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
148 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
149 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
150 /* Combine left and right part */ \
151 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
152 "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
153 "xor %[q0], %[q0], %[ftmp7] \n\t" \
154 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
155 "xor %[p0], %[p0], %[ftmp7] \n\t" \
156 "li %[tmp0], 0x00120012 \n\t" \
157 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
158 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
159 /* Right part */ \
160 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
161 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
162 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
163 /* Left part */ \
164 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
165 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
166 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
167 /* Combine left and right part */ \
168 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
169 "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
170 "xor %[q1], %[q1], %[ftmp7] \n\t" \
171 "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
172 "xor %[p1], %[p1], %[ftmp7] \n\t" \
173 "li %[tmp0], 0x03 \n\t" \
174 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
175 /* Right part */ \
176 "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
177 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
178 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
179 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
180 /* Left part */ \
181 "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
182 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
183 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
184 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
185 /* Combine left and right part */ \
186 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
187 "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
188 "xor %[q2], %[q2], %[ftmp7] \n\t" \
189 "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
190 "xor %[p2], %[p2], %[ftmp7] \n\t"
191
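/* 6-tap horizontal subpel filter on four pixels.  Taps are applied at offsets
 * -2..+3 with the same signs as the FILTER_6TAP reference macro further down
 * (+f0 -f1 +f2 +f3 -f4 +f5), rounded with ff_pw_64 and arithmetically shifted
 * right by 7 (the callers are expected to hold the shift amount in ftmp4);
 * filter0..filter5 are the coefficients from fourtap_subpel_filters splatted
 * to 16-bit lanes.  The H4, V6/V4 and EPEL8 variants below follow the same
 * pattern for 4-tap filtering, vertical filtering (src1 steps through rows by
 * srcstride) and 8-pixel-wide rows. */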
192 #define PUT_VP8_EPEL4_H6_MMI(src, dst) \
193 MMI_ULWC1(%[ftmp1], src, 0x00) \
194 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
195 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
196 \
197 MMI_ULWC1(%[ftmp1], src, -0x01) \
198 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
199 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
200 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
201 \
202 MMI_ULWC1(%[ftmp1], src, -0x02) \
203 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
204 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
205 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
206 \
207 MMI_ULWC1(%[ftmp1], src, 0x01) \
208 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
209 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
210 \
211 MMI_ULWC1(%[ftmp1], src, 0x02) \
212 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
213 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
214 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
215 \
216 MMI_ULWC1(%[ftmp1], src, 0x03) \
217 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
218 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
219 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
220 \
221 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
222 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
223 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
224 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
225 \
226 MMI_SWC1(%[ftmp1], dst, 0x00)
227
228
229 #define PUT_VP8_EPEL4_H4_MMI(src, dst) \
230 MMI_ULWC1(%[ftmp1], src, 0x00) \
231 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
232 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
233 \
234 MMI_ULWC1(%[ftmp1], src, -0x01) \
235 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
236 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
237 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
238 \
239 MMI_ULWC1(%[ftmp1], src, 0x01) \
240 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
241 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
242 \
243 MMI_ULWC1(%[ftmp1], src, 0x02) \
244 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
245 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
246 "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
247 \
248 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
249 \
250 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
251 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
252 \
253 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
254 MMI_SWC1(%[ftmp1], dst, 0x00)
255
256
257 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
258 MMI_ULWC1(%[ftmp1], src, 0x00) \
259 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
260 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
261 \
262 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
263 MMI_ULWC1(%[ftmp1], src1, 0x00) \
264 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
265 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
266 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
267 \
268 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
269 MMI_ULWC1(%[ftmp1], src1, 0x00) \
270 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
271 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
272 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
273 \
274 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
275 MMI_ULWC1(%[ftmp1], src1, 0x00) \
276 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
277 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
278 \
279 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
280 MMI_ULWC1(%[ftmp1], src1, 0x00) \
281 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
282 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
283 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
284 \
285 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
286 MMI_ULWC1(%[ftmp1], src1, 0x00) \
287 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
288 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
289 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
290 \
291 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
292 \
293 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
294 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
295 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
296 \
297 MMI_SWC1(%[ftmp1], dst, 0x00)
298
299
300 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
301 MMI_ULWC1(%[ftmp1], src, 0x00) \
302 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
303 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
304 \
305 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
306 MMI_ULWC1(%[ftmp1], src1, 0x00) \
307 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
308 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
309 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
310 \
311 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
312 MMI_ULWC1(%[ftmp1], src1, 0x00) \
313 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
314 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
315 \
316 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
317 MMI_ULWC1(%[ftmp1], src1, 0x00) \
318 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
319 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
320 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
321 \
322 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
323 \
324 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
325 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
326 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
327 \
328 MMI_SWC1(%[ftmp1], dst, 0x00)
329
330
331 #define PUT_VP8_EPEL8_H6_MMI(src, dst) \
332 MMI_ULDC1(%[ftmp1], src, 0x00) \
333 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
334 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
335 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
336 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
337 \
338 MMI_ULDC1(%[ftmp1], src, -0x01) \
339 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
340 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
341 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
342 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
343 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
344 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
345 \
346 MMI_ULDC1(%[ftmp1], src, -0x02) \
347 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
348 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
349 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
350 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
351 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
352 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
353 \
354 MMI_ULDC1(%[ftmp1], src, 0x01) \
355 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
356 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
357 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
358 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
359 \
360 MMI_ULDC1(%[ftmp1], src, 0x02) \
361 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
362 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
363 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
364 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
365 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
366 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
367 \
368 MMI_ULDC1(%[ftmp1], src, 0x03) \
369 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
370 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
371 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
372 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
373 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
374 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
375 \
376 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
377 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
378 \
379 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
380 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
381 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
382 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
383 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
384 \
385 MMI_SDC1(%[ftmp1], dst, 0x00)
386
387
388 #define PUT_VP8_EPEL8_H4_MMI(src, dst) \
389 MMI_ULDC1(%[ftmp1], src, 0x00) \
390 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
391 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
392 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
393 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
394 \
395 MMI_ULDC1(%[ftmp1], src, -0x01) \
396 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
397 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
398 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
399 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
400 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
401 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
402 \
403 MMI_ULDC1(%[ftmp1], src, 0x01) \
404 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
405 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
406 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
407 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
408 \
409 MMI_ULDC1(%[ftmp1], src, 0x02) \
410 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
411 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
412 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
413 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
414 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
415 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
416 \
417 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
418 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
419 \
420 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
421 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
422 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
423 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
424 \
425 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
426 MMI_SDC1(%[ftmp1], dst, 0x00)
427
428
429 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
430 MMI_ULDC1(%[ftmp1], src, 0x00) \
431 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
432 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
433 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
434 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
435 \
436 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
437 MMI_ULDC1(%[ftmp1], src1, 0x00) \
438 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
439 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
440 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
441 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
442 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
443 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
444 \
445 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
446 MMI_ULDC1(%[ftmp1], src1, 0x00) \
447 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
448 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
449 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
450 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
451 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
452 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
453 \
454 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
455 MMI_ULDC1(%[ftmp1], src1, 0x00) \
456 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
457 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
458 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
459 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
460 \
461 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
462 MMI_ULDC1(%[ftmp1], src1, 0x00) \
463 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
464 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
465 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
466 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
467 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
468 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
469 \
470 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
471 MMI_ULDC1(%[ftmp1], src1, 0x00) \
472 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
473 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
474 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
475 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
476 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
477 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
478 \
479 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
480 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
481 \
482 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
483 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
484 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
485 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
486 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
487 \
488 MMI_SDC1(%[ftmp1], dst, 0x00)
489
490
491 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
492 MMI_ULDC1(%[ftmp1], src, 0x00) \
493 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
494 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
495 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
496 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
497 \
498 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
499 MMI_ULDC1(%[ftmp1], src1, 0x00) \
500 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
501 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
502 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
503 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
504 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
505 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
506 \
507 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
508 MMI_ULDC1(%[ftmp1], src1, 0x00) \
509 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
510 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
511 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
512 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
513 \
514 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
515 MMI_ULDC1(%[ftmp1], src1, 0x00) \
516 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
517 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
518 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
519 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
520 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
521 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
522 \
523 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
524 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
525 \
526 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
527 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
528 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
529 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
530 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
531 \
532 MMI_SDC1(%[ftmp1], dst, 0x00)
533
534
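/* Bilinear MC kernels: each computes (a * src[x] + b * src[x + 1] + 4) >> 3
 * per pixel (the _V variants use the c/d weights and the row at src +
 * sstride), with ff_pw_4 as the rounding term and the shift amount expected
 * in ftmp4.  a/b/c/d are the bilinear weights splatted to 16-bit lanes by the
 * callers. */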
535 #define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
536 MMI_ULDC1(%[ftmp1], src, 0x00) \
537 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
538 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
539 "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
540 "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
541 \
542 MMI_ULDC1(%[ftmp1], src, 0x01) \
543 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
544 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
545 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
546 "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
547 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
548 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
549 \
550 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
551 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
552 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
553 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
554 \
555 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
556 MMI_SDC1(%[ftmp1], dst, 0x00)
557
558
559 #define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
560 MMI_ULWC1(%[ftmp1], src, 0x00) \
561 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
562 "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
563 \
564 MMI_ULWC1(%[ftmp1], src, 0x01) \
565 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
566 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
567 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
568 \
569 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
570 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
571 \
572 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
573 MMI_SWC1(%[ftmp1], dst, 0x00)
574
575
576 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
577 MMI_ULDC1(%[ftmp1], src, 0x00) \
578 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
579 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
580 "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
581 "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
582 \
583 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
584 MMI_ULDC1(%[ftmp1], src1, 0x00) \
585 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
586 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
587 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
588 "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
589 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
590 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
591 \
592 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
593 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
594 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
595 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
596 \
597 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
598 MMI_SDC1(%[ftmp1], dst, 0x00)
599
600
601 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
602 MMI_ULWC1(%[ftmp1], src, 0x00) \
603 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
604 "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
605 \
606 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
607 MMI_ULWC1(%[ftmp1], src1, 0x00) \
608 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
609 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
610 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
611 \
612 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
613 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
614 \
615 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
616 MMI_SWC1(%[ftmp1], dst, 0x00)
617
618
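/* Subpel filter taps with each coefficient replicated into four 16-bit lanes
 * so they can be fed directly to pmullh; the values match the byte table
 * subpel_filters in the reference block below (0x007b = 123, 0x006c = 108,
 * ...). */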
619 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
620 {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
621 0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
622
623 {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
624 0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
625
626 {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
627 0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
628
629 {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
630 0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
631
632 {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
633 0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
634
635 {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
636 0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
637
638 {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
639 0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
640 };
641
642 #if 0
643 #define FILTER_6TAP(src, F, stride) \
644 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
645 F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
646 F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
647
648 #define FILTER_4TAP(src, F, stride) \
649 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
650 F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
651
652 static const uint8_t subpel_filters[7][6] = {
653 { 0, 6, 123, 12, 1, 0 },
654 { 2, 11, 108, 36, 8, 1 },
655 { 0, 9, 93, 50, 6, 0 },
656 { 3, 16, 77, 77, 16, 3 },
657 { 0, 6, 50, 93, 9, 0 },
658 { 1, 8, 36, 108, 11, 2 },
659 { 0, 1, 12, 123, 6, 0 },
660 };
661
662 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
663 #define MUL_35468(a) (((a) * 35468) >> 16)
664 #endif
665
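/* Clamps n to [-128, 127] via the shared crop table (cm points to the middle
 * of ff_crop_tab). */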
666 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
668 ptrdiff_t stride)
669 {
670 int av_unused p1 = p[-2 * stride];
671 int av_unused p0 = p[-1 * stride];
672 int av_unused q0 = p[ 0 * stride];
673 int av_unused q1 = p[ 1 * stride];
674 int a, f1, f2;
675 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
676
677 a = 3 * (q0 - p0);
678 a += clip_int8(p1 - q1);
679 a = clip_int8(a);
680
681 // We deviate from the spec here with c(a+3) >> 3
682 // since that's what libvpx does.
683 f1 = FFMIN(a + 4, 127) >> 3;
684 f2 = FFMIN(a + 3, 127) >> 3;
685
686 // Despite what the spec says, we do need to clamp here to
687 // be bitexact with libvpx.
688 p[-1 * stride] = cm[p0 + f2];
689 p[ 0 * stride] = cm[q0 - f1];
690 }
691
static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
693 ptrdiff_t stride)
694 {
695 int av_unused p1 = p[-2 * stride];
696 int av_unused p0 = p[-1 * stride];
697 int av_unused q0 = p[ 0 * stride];
698 int av_unused q1 = p[ 1 * stride];
699 int a, f1, f2;
700 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
701
702 a = 3 * (q0 - p0);
703 a = clip_int8(a);
704
705 // We deviate from the spec here with c(a+3) >> 3
706 // since that's what libvpx does.
707 f1 = FFMIN(a + 4, 127) >> 3;
708 f2 = FFMIN(a + 3, 127) >> 3;
709
710 // Despite what the spec says, we do need to clamp here to
711 // be bitexact with libvpx.
712 p[-1 * stride] = cm[p0 + f2];
713 p[ 0 * stride] = cm[q0 - f1];
714 a = (f1 + 1) >> 1;
715 p[-2 * stride] = cm[p1 + a];
716 p[ 1 * stride] = cm[q1 - a];
717 }
718
static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
720 int flim)
721 {
722 int av_unused p1 = p[-2 * stride];
723 int av_unused p0 = p[-1 * stride];
724 int av_unused q0 = p[ 0 * stride];
725 int av_unused q1 = p[ 1 * stride];
726
727 return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
728 }
729
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
731 {
732 int av_unused p1 = p[-2 * stride];
733 int av_unused p0 = p[-1 * stride];
734 int av_unused q0 = p[ 0 * stride];
735 int av_unused q1 = p[ 1 * stride];
736
737 return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
738 }
739
static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
741 {
742 int a0, a1, a2, w;
743 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
744
745 int av_unused p2 = p[-3 * stride];
746 int av_unused p1 = p[-2 * stride];
747 int av_unused p0 = p[-1 * stride];
748 int av_unused q0 = p[ 0 * stride];
749 int av_unused q1 = p[ 1 * stride];
750 int av_unused q2 = p[ 2 * stride];
751
752 w = clip_int8(p1 - q1);
753 w = clip_int8(w + 3 * (q0 - p0));
754
755 a0 = (27 * w + 63) >> 7;
756 a1 = (18 * w + 63) >> 7;
757 a2 = (9 * w + 63) >> 7;
758
759 p[-3 * stride] = cm[p2 + a2];
760 p[-2 * stride] = cm[p1 + a1];
761 p[-1 * stride] = cm[p0 + a0];
762 p[ 0 * stride] = cm[q0 - a0];
763 p[ 1 * stride] = cm[q1 - a1];
764 p[ 2 * stride] = cm[q2 - a2];
765 }
766
static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
768 int E, int I)
769 {
770 int av_unused p3 = p[-4 * stride];
771 int av_unused p2 = p[-3 * stride];
772 int av_unused p1 = p[-2 * stride];
773 int av_unused p0 = p[-1 * stride];
774 int av_unused q0 = p[ 0 * stride];
775 int av_unused q1 = p[ 1 * stride];
776 int av_unused q2 = p[ 2 * stride];
777 int av_unused q3 = p[ 3 * stride];
778
779 return vp8_simple_limit(p, stride, E) &&
780 FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
781 FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
782 FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
783 }
784
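/* Vertical (horizontal-edge) macroblock filter on eight pixels: load the rows
 * p3..q3 around dst, run MMI_VP8_LOOP_FILTER and store back p2..q2 (p3 and q3
 * are not modified by the macroblock-edge filter). */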
static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
786 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
787 {
788 double ftmp[18];
789 uint32_t tmp[1];
790 DECLARE_DOUBLE_1;
791 DECLARE_DOUBLE_2;
792 DECLARE_UINT32_T;
793 __asm__ volatile(
794 /* Get data from dst */
795 "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
796 "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
797 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
798 "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
799 "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
800 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
801 "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
802 "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
803 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
804 "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
805 "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
806 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
807 "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
808 "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
809 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
810 "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
811 "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
812 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
813 "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
814 "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
815 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
816 "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
817 "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
818 MMI_VP8_LOOP_FILTER
819 /* Move to dst */
820 "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
821 "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
822 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
823 "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
824 "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
825 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
826 "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
827 "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
828 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
829 "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
830 "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
831 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
832 "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
833 "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
834 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
835 "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
836 "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
837 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
838 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
839 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
840 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
841 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
842 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
843 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
844 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
845 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
846 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
847 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
848 RESTRICT_ASM_UINT32_T
849 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
850 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
851 : "memory"
852 );
853 }
854
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
856 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
857 {
858 int i;
859
860 for (i = 0; i < 8; i++)
861 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
862 int hv = hev(dst + i * 1, stride, hev_thresh);
863 if (hv)
864 vp8_filter_common_is4tap(dst + i * 1, stride);
865 else
866 vp8_filter_common_isnot4tap(dst + i * 1, stride);
867 }
868 }
869
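/* Horizontal (vertical-edge) variant: eight 8-pixel rows spanning the edge
 * are loaded, transposed with TRANSPOSE_8B so the shared MMI_VP8_LOOP_FILTER
 * code can work on p3..q3 as row vectors, then transposed back and stored. */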
static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
871 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
872 {
873 double ftmp[18];
874 uint32_t tmp[1];
875 DECLARE_DOUBLE_1;
876 DECLARE_DOUBLE_2;
877 DECLARE_UINT32_T;
878 __asm__ volatile(
879 /* Get data from dst */
880 "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
881 "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
882 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
883 "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
884 "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
885 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
886 "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
887 "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
888 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
889 "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
890 "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
891 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
892 "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
893 "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
894 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
895 "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
896 "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
897 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
898 "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
899 "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
900 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
901 "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
902 "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
903 /* Matrix transpose */
904 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
905 %[q0], %[q1], %[q2], %[q3],
906 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
907 MMI_VP8_LOOP_FILTER
908 /* Matrix transpose */
909 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
910 %[q0], %[q1], %[q2], %[q3],
911 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
912 /* Move to dst */
913 "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
914 "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
915 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
916 "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
917 "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
918 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
919 "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
920 "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
921 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
922 "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
923 "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
924 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
925 "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
926 "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
927 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
928 "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
929 "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
930 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
931 "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
932 "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
933 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
934 "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
935 "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
936 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
937 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
938 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
939 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
940 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
941 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
942 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
943 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
944 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
945 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
946 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
947 RESTRICT_ASM_UINT32_T
948 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
949 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
950 : "memory"
951 );
952 }
953
static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
955 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
956 {
957 int i;
958
959 for (i = 0; i < 8; i++)
960 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
961 int hv = hev(dst + i * stride, 1, hev_thresh);
962 if (hv)
963 vp8_filter_common_is4tap(dst + i * stride, 1);
964 else
965 vp8_filter_common_isnot4tap(dst + i * stride, 1);
966 }
967 }
968
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
970 {
971 #if 1
972 double ftmp[8];
973 DECLARE_VAR_ALL64;
974
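    /* The asm below performs the vertical (column) butterfly of the inverse
     * WHT in place on dc[], four coefficients at a time; the horizontal pass
     * together with the +3 rounding and >> 3 is done in plain C afterwards,
     * matching the #else reference code, and a second asm block then clears
     * dc[]. */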
975 __asm__ volatile (
976 MMI_LDC1(%[ftmp0], %[dc], 0x00)
977 MMI_LDC1(%[ftmp1], %[dc], 0x08)
978 MMI_LDC1(%[ftmp2], %[dc], 0x10)
979 MMI_LDC1(%[ftmp3], %[dc], 0x18)
980 "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
981 "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
982 "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
983 "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
984 "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
985 "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
986 "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
987 "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
988 MMI_SDC1(%[ftmp0], %[dc], 0x00)
989 MMI_SDC1(%[ftmp1], %[dc], 0x08)
990 MMI_SDC1(%[ftmp2], %[dc], 0x10)
991 MMI_SDC1(%[ftmp3], %[dc], 0x18)
992 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
993 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
994 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
995 [ftmp6]"=&f"(ftmp[6]),
996 RESTRICT_ASM_ALL64
997 [ftmp7]"=&f"(ftmp[7])
998 : [dc]"r"((uint8_t*)dc)
999 : "memory"
1000 );
1001
1002 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1003 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1004 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1005 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1006
1007 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1008 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1009 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1010 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1011
1012 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1013 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1014 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1015 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1016
1017 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1018 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1019 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1020 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1021
1022 __asm__ volatile (
1023 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1024 MMI_SDC1(%[ftmp0], %[dc], 0x00)
1025 MMI_SDC1(%[ftmp0], %[dc], 0x08)
1026 MMI_SDC1(%[ftmp0], %[dc], 0x10)
1027 MMI_SDC1(%[ftmp0], %[dc], 0x18)
1028 : RESTRICT_ASM_ALL64
1029 [ftmp0]"=&f"(ftmp[0])
1030 : [dc]"r"((uint8_t *)dc)
1031 : "memory"
1032 );
1033 #else
1034 int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1035
1036 t00 = dc[0] + dc[12];
1037 t10 = dc[1] + dc[13];
1038 t20 = dc[2] + dc[14];
1039 t30 = dc[3] + dc[15];
1040
1041 t03 = dc[0] - dc[12];
1042 t13 = dc[1] - dc[13];
1043 t23 = dc[2] - dc[14];
1044 t33 = dc[3] - dc[15];
1045
1046 t01 = dc[4] + dc[ 8];
1047 t11 = dc[5] + dc[ 9];
1048 t21 = dc[6] + dc[10];
1049 t31 = dc[7] + dc[11];
1050
1051 t02 = dc[4] - dc[ 8];
1052 t12 = dc[5] - dc[ 9];
1053 t22 = dc[6] - dc[10];
1054 t32 = dc[7] - dc[11];
1055
1056 dc[ 0] = t00 + t01;
1057 dc[ 1] = t10 + t11;
1058 dc[ 2] = t20 + t21;
1059 dc[ 3] = t30 + t31;
1060
1061 dc[ 4] = t03 + t02;
1062 dc[ 5] = t13 + t12;
1063 dc[ 6] = t23 + t22;
1064 dc[ 7] = t33 + t32;
1065
1066 dc[ 8] = t00 - t01;
1067 dc[ 9] = t10 - t11;
1068 dc[10] = t20 - t21;
1069 dc[11] = t30 - t31;
1070
1071 dc[12] = t03 - t02;
1072 dc[13] = t13 - t12;
1073 dc[14] = t23 - t22;
1074 dc[15] = t33 - t32;
1075
1076 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1077 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1078 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1079 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1080
1081 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1082 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1083 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1084 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1085
1086 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1087 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1088 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1089 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1090
1091 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1092 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1093 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1094 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1095
1096 AV_ZERO64(dc + 0);
1097 AV_ZERO64(dc + 4);
1098 AV_ZERO64(dc + 8);
1099 AV_ZERO64(dc + 12);
1100 #endif
1101 }
1102
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1104 {
1105 int val = (dc[0] + 3) >> 3;
1106
1107 dc[0] = 0;
1108
1109 block[0][0][0] = val;
1110 block[0][1][0] = val;
1111 block[0][2][0] = val;
1112 block[0][3][0] = val;
1113 block[1][0][0] = val;
1114 block[1][1][0] = val;
1115 block[1][2][0] = val;
1116 block[1][3][0] = val;
1117 block[2][0][0] = val;
1118 block[2][1][0] = val;
1119 block[2][2][0] = val;
1120 block[2][3][0] = val;
1121 block[3][0][0] = val;
1122 block[3][1][0] = val;
1123 block[3][2][0] = val;
1124 block[3][3][0] = val;
1125 }
1126
void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1128 {
1129 #if 1
1130 DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
1131 DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
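    /* 0x4e7b = 20091 and 0x22a3 = 8867: MUL_20091(x) = ((x * 20091) >> 16) + x
     * is computed as pmulhh(x, ff_ph_4e7b) + x, and MUL_35468(x) =
     * (x * 35468) >> 16 as pmulhh(x << 2, ff_ph_22a3), since 35468 = 4 * 8867
     * (hence the psllh by 2 below). */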
1132 double ftmp[12];
1133 uint32_t tmp[1];
1134 DECLARE_VAR_LOW32;
1135 DECLARE_VAR_ALL64;
1136
1137 __asm__ volatile (
1138 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1139 MMI_LDC1(%[ftmp1], %[block], 0x00)
1140 MMI_LDC1(%[ftmp2], %[block], 0x08)
1141 MMI_LDC1(%[ftmp3], %[block], 0x10)
1142 MMI_LDC1(%[ftmp4], %[block], 0x18)
1143
1144 "li %[tmp0], 0x02 \n\t"
1145 "mtc1 %[tmp0], %[ftmp11] \n\t"
1146
1147 // block[0...3] + block[8...11]
1148 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1149 // block[0...3] - block[8...11]
1150 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1151 // MUL_35468(block[12...15])
1152 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1153 "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
1154 // MUL_35468(block[4...7])
1155 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1156 "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
1157             // MUL_20091(block[4...7])
1158 "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
1159 "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
1160 // MUL_20091(block[12...15])
1161 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1162 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
1163
1164 // tmp[0 4 8 12]
1165 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
1166 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
1167 // tmp[1 5 9 13]
1168 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
1169 "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
1170 // tmp[2 6 10 14]
1171 "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
1172 "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
1173 // tmp[3 7 11 15]
1174 "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
1175 "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1176
1177 MMI_SDC1(%[ftmp0], %[block], 0x00)
1178 MMI_SDC1(%[ftmp0], %[block], 0x08)
1179 MMI_SDC1(%[ftmp0], %[block], 0x10)
1180 MMI_SDC1(%[ftmp0], %[block], 0x18)
1181
1182 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1183 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1184
1185 // t[0 4 8 12]
1186 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1187 // t[1 5 9 13]
1188 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1189 // t[2 6 10 14]
1190 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1191 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1192 "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
1193 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1194 "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1195 // t[3 7 11 15]
1196 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1197 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1198 "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
1199 "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
1200 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1201
1202 "li %[tmp0], 0x03 \n\t"
1203 "mtc1 %[tmp0], %[ftmp11] \n\t"
1204 "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
1205 "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
1206 "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1207 "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
1208 "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
1209 "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
1210 "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
1211 "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
1212 "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
1213 "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
1214 "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
1215 "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
1216
1217 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1218 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1219
1220 MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1221 MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1222 MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1223 MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1224
1225 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1226 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1227 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1228 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1229
1230 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1231 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1232 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1233 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1234
1235 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1236 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1237 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1238 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1239
1240 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1241 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1242 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1243 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1244 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1245 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1246 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1247 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1248 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1249 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1250 RESTRICT_ASM_LOW32
1251 RESTRICT_ASM_ALL64
1252 [tmp0]"=&r"(tmp[0])
1253 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1254 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1255 [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
1256 [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
1257 : "memory"
1258 );
1259 #else
1260 int i, t0, t1, t2, t3;
1261 int16_t tmp[16];
1262
1263 for (i = 0; i < 4; i++) {
1264 t0 = block[0 + i] + block[8 + i];
1265 t1 = block[0 + i] - block[8 + i];
1266 t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1267 t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1268 block[ 0 + i] = 0;
1269 block[ 4 + i] = 0;
1270 block[ 8 + i] = 0;
1271 block[12 + i] = 0;
1272
1273 tmp[i * 4 + 0] = t0 + t3;
1274 tmp[i * 4 + 1] = t1 + t2;
1275 tmp[i * 4 + 2] = t1 - t2;
1276 tmp[i * 4 + 3] = t0 - t3;
1277 }
1278
1279 for (i = 0; i < 4; i++) {
1280 t0 = tmp[0 + i] + tmp[8 + i];
1281 t1 = tmp[0 + i] - tmp[8 + i];
1282 t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1283 t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1284
1285 dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1286 dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1287 dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1288 dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1289 dst += stride;
1290 }
1291 #endif
1292 }
1293
void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1295 {
1296 #if 1
1297 int dc = (block[0] + 4) >> 3;
1298 double ftmp[6];
1299 DECLARE_VAR_LOW32;
1300
1301 block[0] = 0;
1302
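    /* Splat the rounded DC into all four 16-bit lanes with pshufh, add it
     * with saturation to each of the four 4-pixel rows and pack back to
     * unsigned bytes; the MMI counterpart of the scalar loop in the #else
     * branch. */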
1303 __asm__ volatile (
1304 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1305 "mtc1 %[dc], %[ftmp5] \n\t"
1306 MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1307 MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1308 MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1309 MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1310 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1311 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1312 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1313 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1314 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1315 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1316 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1317 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1318 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1319 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1320 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1321 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1322 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1323 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1324 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1325 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1326 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1327 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1328 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1329 [ftmp4]"=&f"(ftmp[4]),
1330 RESTRICT_ASM_LOW32
1331 [ftmp5]"=&f"(ftmp[5])
1332 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1333 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1334 [dc]"r"(dc)
1335 : "memory"
1336 );
1337 #else
1338 int i, dc = (block[0] + 4) >> 3;
1339
1340 block[0] = 0;
1341
1342 for (i = 0; i < 4; i++) {
1343 dst[0] = av_clip_uint8(dst[0] + dc);
1344 dst[1] = av_clip_uint8(dst[1] + dc);
1345 dst[2] = av_clip_uint8(dst[2] + dc);
1346 dst[3] = av_clip_uint8(dst[3] + dc);
1347 dst += stride;
1348 }
1349 #endif
1350 }
1351
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1353 ptrdiff_t stride)
1354 {
1355 ff_vp8_idct_dc_add_mmi(dst + 0, block[0], stride);
1356 ff_vp8_idct_dc_add_mmi(dst + 4, block[1], stride);
1357 ff_vp8_idct_dc_add_mmi(dst + 8, block[2], stride);
1358 ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1359 }
1360
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1362 ptrdiff_t stride)
1363 {
1364 ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1365 ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1366 ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1367 ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1368 }
1369
1370 // loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1372 int flim_I, int hev_thresh)
1373 {
1374 vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1375 vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1376 }
1377
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1379 int flim_I, int hev_thresh)
1380 {
1381 vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1382 vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1383 hev_thresh);
1384 }
1385
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1387 int flim_E, int flim_I, int hev_thresh)
1388 {
1389 vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1390 vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1391 }
1392
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1394 int flim_E, int flim_I, int hev_thresh)
1395 {
1396 vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1397 vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1398 }
1399
1400 // loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1402 int flim_E, int flim_I, int hev_thresh)
1403 {
1404 int i;
1405
1406 for (i = 0; i < 16; i++)
1407 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1408 int hv = hev(dst + i * 1, stride, hev_thresh);
1409 if (hv)
1410 vp8_filter_common_is4tap(dst + i * 1, stride);
1411 else
1412 vp8_filter_common_isnot4tap(dst + i * 1, stride);
1413 }
1414 }
1415
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1417 int flim_E, int flim_I, int hev_thresh)
1418 {
1419 int i;
1420
1421 for (i = 0; i < 16; i++)
1422 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1423 int hv = hev(dst + i * stride, 1, hev_thresh);
1424 if (hv)
1425 vp8_filter_common_is4tap(dst + i * stride, 1);
1426 else
1427 vp8_filter_common_isnot4tap(dst + i * stride, 1);
1428 }
1429 }
1430
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1432 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1433 {
1434 vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1435 vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1436 }
1437
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1439 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1440 {
1441 vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1442 vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1443 }
1444
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1446 {
1447 int i;
1448
1449 for (i = 0; i < 16; i++)
1450 if (vp8_simple_limit(dst + i, stride, flim))
1451 vp8_filter_common_is4tap(dst + i, stride);
1452 }
1453
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1455 {
1456 int i;
1457
1458 for (i = 0; i < 16; i++)
1459 if (vp8_simple_limit(dst + i * stride, 1, flim))
1460 vp8_filter_common_is4tap(dst + i * stride, 1);
1461 }
1462
void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1464 ptrdiff_t srcstride, int h, int x, int y)
1465 {
1466 #if 1
1467 double ftmp[2];
1468 uint64_t tmp[2];
1469 mips_reg addr[2];
1470 DECLARE_VAR_ALL64;
1471
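    /* Copy a 16-wide block two rows per iteration: an unaligned MMI 64-bit
     * load/store for the low eight bytes and ldl/ldr / sdl/sdr pairs for the
     * high eight; the x/y subpel arguments are unused in this full-pel copy. */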
1472 __asm__ volatile (
1473 "1: \n\t"
1474 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1475 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1476 "ldl %[tmp0], 0x0f(%[src]) \n\t"
1477 "ldr %[tmp0], 0x08(%[src]) \n\t"
1478 MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1479 "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1480 "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1481 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1482 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1483 "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1484 "sdr %[tmp0], 0x08(%[dst]) \n\t"
1485 "addiu %[h], %[h], -0x02 \n\t"
1486 MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1487 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1488 "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1489 "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1490 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1491 "bnez %[h], 1b \n\t"
1492 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1493 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1494 RESTRICT_ASM_ALL64
1495 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1496 [dst]"+&r"(dst), [src]"+&r"(src),
1497 [h]"+&r"(h)
1498 : [dststride]"r"((mips_reg)dststride),
1499 [srcstride]"r"((mips_reg)srcstride)
1500 : "memory"
1501 );
1502 #else
1503 int i;
1504
1505 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1506 memcpy(dst, src, 16);
1507 #endif
1508 }
1509
1510 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1511 ptrdiff_t srcstride, int h, int x, int y)
1512 {
1513 #if 1
1514 double ftmp[1];
1515 uint64_t tmp[1];
1516 mips_reg addr[2];
1517 DECLARE_VAR_ALL64;
1518
1519 __asm__ volatile (
1520 "1: \n\t"
1521 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1522 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1523 "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1524 "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1525 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1526 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1527 "addiu %[h], %[h], -0x02 \n\t"
1528 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1529 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1530 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1531 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1532 "bnez %[h], 1b \n\t"
1533 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1534 RESTRICT_ASM_ALL64
1535 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1536 [dst]"+&r"(dst), [src]"+&r"(src),
1537 [h]"+&r"(h)
1538 : [dststride]"r"((mips_reg)dststride),
1539 [srcstride]"r"((mips_reg)srcstride)
1540 : "memory"
1541 );
1542 #else
1543 int i;
1544
1545 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1546 memcpy(dst, src, 8);
1547 #endif
1548 }
1549
1550 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1551 ptrdiff_t srcstride, int h, int x, int y)
1552 {
1553 #if 1
1554 double ftmp[1];
1555 uint64_t tmp[1];
1556 mips_reg addr[2];
1557 DECLARE_VAR_LOW32;
1558
1559 __asm__ volatile (
1560 "1: \n\t"
1561 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1562 MMI_LWC1(%[ftmp0], %[src], 0x00)
1563 "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1564 "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1565 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1566 MMI_SWC1(%[ftmp0], %[dst], 0x00)
1567 "addiu %[h], %[h], -0x02 \n\t"
1568 "swl %[tmp0], 0x03(%[addr1]) \n\t"
1569 "swr %[tmp0], 0x00(%[addr1]) \n\t"
1570 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1571 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1572 "bnez %[h], 1b \n\t"
1573 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1574 RESTRICT_ASM_LOW32
1575 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1576 [dst]"+&r"(dst), [src]"+&r"(src),
1577 [h]"+&r"(h)
1578 : [dststride]"r"((mips_reg)dststride),
1579 [srcstride]"r"((mips_reg)srcstride)
1580 : "memory"
1581 );
1582 #else
1583 int i;
1584
1585 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1586 memcpy(dst, src, 4);
1587 #endif
1588 }
1589
1590 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1591 ptrdiff_t srcstride, int h, int mx, int my)
1592 {
1593 #if 1
1594 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1595 double ftmp[9];
1596 uint32_t tmp[1];
1597 mips_reg src1, dst1;
1598 DECLARE_VAR_ALL64;
1599
1600 /*
1601 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1602 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1603 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1604 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1605 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1606 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1607 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1608 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1609
1610 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1611 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1612 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1613 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1614 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1615 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1616 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1617 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1618 */
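    /* The MMI loop below computes the same 4-tap horizontal filter on two
     * 8-pixel halves per row (PUT_VP8_EPEL8_H4_MMI on src/dst and again on
     * src+8/dst+8); ftmp4 holds the final shift count (7) and ff_pw_64 the
     * rounding constant. */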
1619 __asm__ volatile (
1620 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1621 "li %[tmp0], 0x07 \n\t"
1622 "mtc1 %[tmp0], %[ftmp4] \n\t"
1623
1624 "1: \n\t"
1625 // 0 - 7
1626 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1627 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1628 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1629 // 8 - 15
1630 PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1631
1632 "addiu %[h], %[h], -0x01 \n\t"
1633 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1634 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1635 "bnez %[h], 1b \n\t"
1636 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1637 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1638 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1639 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1640 [ftmp8]"=&f"(ftmp[8]),
1641 [tmp0]"=&r"(tmp[0]),
1642 RESTRICT_ASM_ALL64
1643 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1644 [h]"+&r"(h),
1645 [dst]"+&r"(dst), [src]"+&r"(src)
1646 : [ff_pw_64]"f"(ff_pw_64),
1647 [srcstride]"r"((mips_reg)srcstride),
1648 [dststride]"r"((mips_reg)dststride),
1649 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1650 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1651 : "memory"
1652 );
1653 #else
1654 const uint8_t *filter = subpel_filters[mx - 1];
1655 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1656 int x, y;
1657
1658 for (y = 0; y < h; y++) {
1659 for (x = 0; x < 16; x++)
1660 dst[x] = FILTER_4TAP(src, filter, 1);
1661 dst += dststride;
1662 src += srcstride;
1663 }
1664 #endif
1665 }
1666
1667 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1668 ptrdiff_t srcstride, int h, int mx, int my)
1669 {
1670 #if 1
1671 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1672 double ftmp[9];
1673 uint32_t tmp[1];
1674 DECLARE_VAR_ALL64;
1675
1676 /*
1677 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1678 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1679 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1680 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1681 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1682 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1683 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1684 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1685 */
1686 __asm__ volatile (
1687 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1688 "li %[tmp0], 0x07 \n\t"
1689 "mtc1 %[tmp0], %[ftmp4] \n\t"
1690
1691 "1: \n\t"
1692 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1693
1694 "addiu %[h], %[h], -0x01 \n\t"
1695 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1696 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1697 "bnez %[h], 1b \n\t"
1698 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1699 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1700 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1701 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1702 [ftmp8]"=&f"(ftmp[8]),
1703 [tmp0]"=&r"(tmp[0]),
1704 RESTRICT_ASM_ALL64
1705 [h]"+&r"(h),
1706 [dst]"+&r"(dst), [src]"+&r"(src)
1707 : [ff_pw_64]"f"(ff_pw_64),
1708 [srcstride]"r"((mips_reg)srcstride),
1709 [dststride]"r"((mips_reg)dststride),
1710 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1711 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1712 : "memory"
1713 );
1714 #else
1715 const uint8_t *filter = subpel_filters[mx - 1];
1716 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1717 int x, y;
1718
1719 for (y = 0; y < h; y++) {
1720 for (x = 0; x < 8; x++)
1721 dst[x] = FILTER_4TAP(src, filter, 1);
1722 dst += dststride;
1723 src += srcstride;
1724 }
1725 #endif
1726 }
1727
1728 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1729 ptrdiff_t srcstride, int h, int mx, int my)
1730 {
1731 #if 1
1732 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1733 double ftmp[6];
1734 uint32_t tmp[1];
1735 DECLARE_VAR_LOW32;
1736
1737 /*
1738 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1739 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1740 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1741 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1742 */
1743 __asm__ volatile (
1744 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1745 "li %[tmp0], 0x07 \n\t"
1746 "mtc1 %[tmp0], %[ftmp4] \n\t"
1747
1748 "1: \n\t"
1749 PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1750
1751 "addiu %[h], %[h], -0x01 \n\t"
1752 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1753 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1754 "bnez %[h], 1b \n\t"
1755 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1756 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1757 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1758 [tmp0]"=&r"(tmp[0]),
1759 RESTRICT_ASM_LOW32
1760 [h]"+&r"(h),
1761 [dst]"+&r"(dst), [src]"+&r"(src)
1762 : [ff_pw_64]"f"(ff_pw_64),
1763 [srcstride]"r"((mips_reg)srcstride),
1764 [dststride]"r"((mips_reg)dststride),
1765 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1766 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1767 : "memory"
1768 );
1769 #else
1770 const uint8_t *filter = subpel_filters[mx - 1];
1771 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1772 int x, y;
1773
1774 for (y = 0; y < h; y++) {
1775 for (x = 0; x < 4; x++)
1776 dst[x] = FILTER_4TAP(src, filter, 1);
1777 dst += dststride;
1778 src += srcstride;
1779 }
1780 #endif
1781 }
1782
1783 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1784 ptrdiff_t srcstride, int h, int mx, int my)
1785 {
1786 #if 1
1787 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1788 double ftmp[9];
1789 uint32_t tmp[1];
1790 mips_reg src1, dst1;
1791 DECLARE_VAR_ALL64;
1792
1793 /*
1794 dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1795 dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1796 dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1797 dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1798 dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1799 dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1800 dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1801 dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1802
1803 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1804 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1805 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1806 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1807 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1808 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1809 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1810 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1811 */
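    /* Same two-halves-per-row structure as the 4-tap case above, with the
     * 6-tap macro adding the filter[0]/filter[5] taps on src[-2]/src[+3]. */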
1812 __asm__ volatile (
1813 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1814 "li %[tmp0], 0x07 \n\t"
1815 "mtc1 %[tmp0], %[ftmp4] \n\t"
1816
1817 "1: \n\t"
1818 // 0 - 7
1819 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1820 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1821 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1822 // 8 - 15
1823 PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1824
1825 "addiu %[h], %[h], -0x01 \n\t"
1826 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1827 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1828 "bnez %[h], 1b \n\t"
1829 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1830 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1831 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1832 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1833 [ftmp8]"=&f"(ftmp[8]),
1834 [tmp0]"=&r"(tmp[0]),
1835 RESTRICT_ASM_ALL64
1836 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1837 [h]"+&r"(h),
1838 [dst]"+&r"(dst), [src]"+&r"(src)
1839 : [ff_pw_64]"f"(ff_pw_64),
1840 [srcstride]"r"((mips_reg)srcstride),
1841 [dststride]"r"((mips_reg)dststride),
1842 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1843 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1844 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1845 : "memory"
1846 );
1847 #else
1848 const uint8_t *filter = subpel_filters[mx - 1];
1849 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1850 int x, y;
1851
1852 for (y = 0; y < h; y++) {
1853 for (x = 0; x < 16; x++)
1854 dst[x] = FILTER_6TAP(src, filter, 1);
1855 dst += dststride;
1856 src += srcstride;
1857 }
1858 #endif
1859 }
1860
1861 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1862 ptrdiff_t srcstride, int h, int mx, int my)
1863 {
1864 #if 1
1865 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1866 double ftmp[9];
1867 uint32_t tmp[1];
1868 DECLARE_VAR_ALL64;
1869
1870 /*
1871 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1872 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1873 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1874 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1875 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1876 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1877 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1878 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1879 */
1880 __asm__ volatile (
1881 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1882 "li %[tmp0], 0x07 \n\t"
1883 "mtc1 %[tmp0], %[ftmp4] \n\t"
1884
1885 "1: \n\t"
1886 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1887
1888 "addiu %[h], %[h], -0x01 \n\t"
1889 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1890 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1891 "bnez %[h], 1b \n\t"
1892 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1893 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1894 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1895 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1896 [ftmp8]"=&f"(ftmp[8]),
1897 [tmp0]"=&r"(tmp[0]),
1898 RESTRICT_ASM_ALL64
1899 [h]"+&r"(h),
1900 [dst]"+&r"(dst), [src]"+&r"(src)
1901 : [ff_pw_64]"f"(ff_pw_64),
1902 [srcstride]"r"((mips_reg)srcstride),
1903 [dststride]"r"((mips_reg)dststride),
1904 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1905 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1906 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1907 : "memory"
1908 );
1909 #else
1910 const uint8_t *filter = subpel_filters[mx - 1];
1911 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1912 int x, y;
1913
1914 for (y = 0; y < h; y++) {
1915 for (x = 0; x < 8; x++)
1916 dst[x] = FILTER_6TAP(src, filter, 1);
1917 dst += dststride;
1918 src += srcstride;
1919 }
1920 #endif
1921 }
1922
1923 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1924 ptrdiff_t srcstride, int h, int mx, int my)
1925 {
1926 #if 1
1927 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1928 double ftmp[6];
1929 uint32_t tmp[1];
1930 DECLARE_VAR_LOW32;
1931
1932 /*
1933 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1934 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1935 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1936 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1937 */
1938 __asm__ volatile (
1939 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1940 "li %[tmp0], 0x07 \n\t"
1941 "mtc1 %[tmp0], %[ftmp4] \n\t"
1942
1943 "1: \n\t"
1944 PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1945
1946 "addiu %[h], %[h], -0x01 \n\t"
1947 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1948 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1949 "bnez %[h], 1b \n\t"
1950 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1951 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1952 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1953 [tmp0]"=&r"(tmp[0]),
1954 RESTRICT_ASM_LOW32
1955 [h]"+&r"(h),
1956 [dst]"+&r"(dst), [src]"+&r"(src)
1957 : [ff_pw_64]"f"(ff_pw_64),
1958 [srcstride]"r"((mips_reg)srcstride),
1959 [dststride]"r"((mips_reg)dststride),
1960 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1961 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1962 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1963 : "memory"
1964 );
1965 #else
1966 const uint8_t *filter = subpel_filters[mx - 1];
1967 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1968 int x, y;
1969
1970 for (y = 0; y < h; y++) {
1971 for (x = 0; x < 4; x++)
1972 dst[x] = FILTER_6TAP(src, filter, 1);
1973 dst += dststride;
1974 src += srcstride;
1975 }
1976 #endif
1977 }
1978
1979 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1980 ptrdiff_t srcstride, int h, int mx, int my)
1981 {
1982 #if 1
1983 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1984 double ftmp[9];
1985 uint32_t tmp[1];
1986 mips_reg src0, src1, dst0;
1987 DECLARE_VAR_ALL64;
1988
1989 /*
1990 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1991 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1992 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1993 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1994 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1995 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1996 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1997 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1998
1999 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2000 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2001 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2002 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2003 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2004 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2005 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2006 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2007 */
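    /* Vertical 4-tap: each 8-pixel half reads the rows at -srcstride, 0,
     * +srcstride and +2*srcstride.  src1 is an output-only scratch register
     * used inside PUT_VP8_EPEL8_V4_MMI, presumably to address the
     * neighbouring rows. */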
2008 __asm__ volatile (
2009 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2010 "li %[tmp0], 0x07 \n\t"
2011 "mtc1 %[tmp0], %[ftmp4] \n\t"
2012
2013 "1: \n\t"
2014 // 0 - 7
2015 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2016 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2017 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2018 // 8 - 15
2019         PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2020
2021 "addiu %[h], %[h], -0x01 \n\t"
2022 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2023 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2024 "bnez %[h], 1b \n\t"
2025 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2026 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2027 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2028 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2029 [ftmp8]"=&f"(ftmp[8]),
2030 [tmp0]"=&r"(tmp[0]),
2031 RESTRICT_ASM_ALL64
2032 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2033 [src1]"=&r"(src1),
2034 [h]"+&r"(h),
2035 [dst]"+&r"(dst), [src]"+&r"(src)
2036 : [ff_pw_64]"f"(ff_pw_64),
2037 [srcstride]"r"((mips_reg)srcstride),
2038 [dststride]"r"((mips_reg)dststride),
2039 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2040 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2041 : "memory"
2042 );
2043 #else
2044 const uint8_t *filter = subpel_filters[my - 1];
2045 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2046 int x, y;
2047
2048 for (y = 0; y < h; y++) {
2049 for (x = 0; x < 16; x++)
2050 dst[x] = FILTER_4TAP(src, filter, srcstride);
2051 dst += dststride;
2052 src += srcstride;
2053 }
2054 #endif
2055 }
2056
2057 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2058 ptrdiff_t srcstride, int h, int mx, int my)
2059 {
2060 #if 1
2061 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2062 double ftmp[9];
2063 uint32_t tmp[1];
2064 mips_reg src1;
2065 DECLARE_VAR_ALL64;
2066
2067 /*
2068 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2069 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2070 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2071 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2072 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2073 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2074 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2075 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2076 */
2077 __asm__ volatile (
2078 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2079 "li %[tmp0], 0x07 \n\t"
2080 "mtc1 %[tmp0], %[ftmp4] \n\t"
2081
2082 "1: \n\t"
2083 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2084
2085 "addiu %[h], %[h], -0x01 \n\t"
2086 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2087 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2088 "bnez %[h], 1b \n\t"
2089 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2090 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2091 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2092 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2093 [ftmp8]"=&f"(ftmp[8]),
2094 [tmp0]"=&r"(tmp[0]),
2095 RESTRICT_ASM_ALL64
2096 [src1]"=&r"(src1),
2097 [h]"+&r"(h),
2098 [dst]"+&r"(dst), [src]"+&r"(src)
2099 : [ff_pw_64]"f"(ff_pw_64),
2100 [srcstride]"r"((mips_reg)srcstride),
2101 [dststride]"r"((mips_reg)dststride),
2102 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2103 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2104 : "memory"
2105 );
2106 #else
2107 const uint8_t *filter = subpel_filters[my - 1];
2108 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2109 int x, y;
2110
2111 for (y = 0; y < h; y++) {
2112 for (x = 0; x < 8; x++)
2113 dst[x] = FILTER_4TAP(src, filter, srcstride);
2114 dst += dststride;
2115 src += srcstride;
2116 }
2117 #endif
2118 }
2119
2120 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2121 ptrdiff_t srcstride, int h, int mx, int my)
2122 {
2123 #if 1
2124 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2125 double ftmp[6];
2126 uint32_t tmp[1];
2127 mips_reg src1;
2128 DECLARE_VAR_LOW32;
2129
2130 /*
2131 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2132 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2133 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2134 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2135 */
2136 __asm__ volatile (
2137 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2138 "li %[tmp0], 0x07 \n\t"
2139 "mtc1 %[tmp0], %[ftmp4] \n\t"
2140
2141 "1: \n\t"
2142 PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2143
2144 "addiu %[h], %[h], -0x01 \n\t"
2145 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2146 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2147 "bnez %[h], 1b \n\t"
2148 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2149 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2150 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2151 [tmp0]"=&r"(tmp[0]),
2152 RESTRICT_ASM_LOW32
2153 [src1]"=&r"(src1),
2154 [h]"+&r"(h),
2155 [dst]"+&r"(dst), [src]"+&r"(src)
2156 : [ff_pw_64]"f"(ff_pw_64),
2157 [srcstride]"r"((mips_reg)srcstride),
2158 [dststride]"r"((mips_reg)dststride),
2159 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2160 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2161 : "memory"
2162 );
2163 #else
2164 const uint8_t *filter = subpel_filters[my - 1];
2165 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2166 int x, y;
2167
2168 for (y = 0; y < h; y++) {
2169 for (x = 0; x < 4; x++)
2170 dst[x] = FILTER_4TAP(src, filter, srcstride);
2171 dst += dststride;
2172 src += srcstride;
2173 }
2174 #endif
2175 }
2176
2177 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2178 ptrdiff_t srcstride, int h, int mx, int my)
2179 {
2180 #if 1
2181 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2182 double ftmp[9];
2183 uint32_t tmp[1];
2184 mips_reg src0, src1, dst0;
2185 DECLARE_VAR_ALL64;
2186
2187 /*
2188 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2189 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2190 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2191 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2192 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2193 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2194 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2195 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2196
2197 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2198 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2199 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2200 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2201 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2202 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2203 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2204 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2205 */
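    /* Vertical 6-tap: same structure as the 4-tap version, reading the six
     * rows from -2*srcstride to +3*srcstride for each output row. */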
2206 __asm__ volatile (
2207 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2208 "li %[tmp0], 0x07 \n\t"
2209 "mtc1 %[tmp0], %[ftmp4] \n\t"
2210
2211 "1: \n\t"
2212 // 0 - 7
2213 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2214 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2215 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2216 // 8 - 15
2217 PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2218
2219 "addiu %[h], %[h], -0x01 \n\t"
2220 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2221 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2222 "bnez %[h], 1b \n\t"
2223 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2224 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2225 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2226 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2227 [ftmp8]"=&f"(ftmp[8]),
2228 [tmp0]"=&r"(tmp[0]),
2229 RESTRICT_ASM_ALL64
2230 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2231 [src1]"=&r"(src1),
2232 [h]"+&r"(h),
2233 [dst]"+&r"(dst), [src]"+&r"(src)
2234 : [ff_pw_64]"f"(ff_pw_64),
2235 [srcstride]"r"((mips_reg)srcstride),
2236 [dststride]"r"((mips_reg)dststride),
2237 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2238 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2239 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2240 : "memory"
2241 );
2242 #else
2243 const uint8_t *filter = subpel_filters[my - 1];
2244 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2245 int x, y;
2246
2247 for (y = 0; y < h; y++) {
2248 for (x = 0; x < 16; x++)
2249 dst[x] = FILTER_6TAP(src, filter, srcstride);
2250 dst += dststride;
2251 src += srcstride;
2252 }
2253 #endif
2254 }
2255
2256 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2257 ptrdiff_t srcstride, int h, int mx, int my)
2258 {
2259 #if 1
2260 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2261 double ftmp[9];
2262 uint32_t tmp[1];
2263 mips_reg src1;
2264 DECLARE_VAR_ALL64;
2265
2266 /*
2267 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2268 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2269 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2270 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2271 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2272 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2273 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2274 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2275 */
2276 __asm__ volatile (
2277 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2278 "li %[tmp0], 0x07 \n\t"
2279 "mtc1 %[tmp0], %[ftmp4] \n\t"
2280
2281 "1: \n\t"
2282 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2283
2284 "addiu %[h], %[h], -0x01 \n\t"
2285 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2286 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2287 "bnez %[h], 1b \n\t"
2288 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2289 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2290 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2291 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2292 [ftmp8]"=&f"(ftmp[8]),
2293 [tmp0]"=&r"(tmp[0]),
2294 RESTRICT_ASM_ALL64
2295 [src1]"=&r"(src1),
2296 [h]"+&r"(h),
2297 [dst]"+&r"(dst), [src]"+&r"(src)
2298 : [ff_pw_64]"f"(ff_pw_64),
2299 [srcstride]"r"((mips_reg)srcstride),
2300 [dststride]"r"((mips_reg)dststride),
2301 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2302 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2303 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2304 : "memory"
2305 );
2306 #else
2307 const uint8_t *filter = subpel_filters[my - 1];
2308 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2309 int x, y;
2310
2311 for (y = 0; y < h; y++) {
2312 for (x = 0; x < 8; x++)
2313 dst[x] = FILTER_6TAP(src, filter, srcstride);
2314 dst += dststride;
2315 src += srcstride;
2316 }
2317 #endif
2318 }
2319
2320 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2321 ptrdiff_t srcstride, int h, int mx, int my)
2322 {
2323 #if 1
2324 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2325 double ftmp[6];
2326 uint32_t tmp[1];
2327 mips_reg src1;
2328 DECLARE_VAR_LOW32;
2329
2330 /*
2331 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2332 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2333 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2334 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2335 */
2336 __asm__ volatile (
2337 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2338 "li %[tmp0], 0x07 \n\t"
2339 "mtc1 %[tmp0], %[ftmp4] \n\t"
2340
2341 "1: \n\t"
2342 PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2343
2344 "addiu %[h], %[h], -0x01 \n\t"
2345 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2346 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2347 "bnez %[h], 1b \n\t"
2348 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2349 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2350 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2351 [tmp0]"=&r"(tmp[0]),
2352 RESTRICT_ASM_LOW32
2353 [src1]"=&r"(src1),
2354 [h]"+&r"(h),
2355 [dst]"+&r"(dst), [src]"+&r"(src)
2356 : [ff_pw_64]"f"(ff_pw_64),
2357 [srcstride]"r"((mips_reg)srcstride),
2358 [dststride]"r"((mips_reg)dststride),
2359 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2360 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2361 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2362 : "memory"
2363 );
2364 #else
2365 const uint8_t *filter = subpel_filters[my - 1];
2366 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2367 int x, y;
2368
2369 for (y = 0; y < h; y++) {
2370 for (x = 0; x < 4; x++)
2371 dst[x] = FILTER_6TAP(src, filter, srcstride);
2372 dst += dststride;
2373 src += srcstride;
2374 }
2375 #endif
2376 }
2377
2378 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2379 ptrdiff_t srcstride, int h, int mx, int my)
2380 {
2381 #if 1
2382 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2383 uint8_t *tmp = tmp_array;
2384
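    /* Two-pass h4v4: the horizontal 4-tap pass starts one row above the
     * destination's source area and fills h + 3 rows of the 16-wide temp
     * buffer, which is exactly the footprint the vertical 4-tap pass needs
     * (one row above, two below each output row).  tmp_array + 16 then points
     * at the row corresponding to the first output row. */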
2385 src -= srcstride;
2386 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2387 tmp = tmp_array + 16;
2388 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2389 #else
2390 const uint8_t *filter = subpel_filters[mx - 1];
2391 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2392 int x, y;
2393 uint8_t tmp_array[560];
2394 uint8_t *tmp = tmp_array;
2395
2396 src -= srcstride;
2397
2398 for (y = 0; y < h + 3; y++) {
2399 for (x = 0; x < 16; x++)
2400 tmp[x] = FILTER_4TAP(src, filter, 1);
2401 tmp += 16;
2402 src += srcstride;
2403 }
2404
2405 tmp = tmp_array + 16;
2406 filter = subpel_filters[my - 1];
2407
2408 for (y = 0; y < h; y++) {
2409 for (x = 0; x < 16; x++)
2410 dst[x] = FILTER_4TAP(tmp, filter, 16);
2411 dst += dststride;
2412 tmp += 16;
2413 }
2414 #endif
2415 }
2416
2417 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2418 ptrdiff_t srcstride, int h, int mx, int my)
2419 {
2420 #if 1
2421 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2422 uint8_t *tmp = tmp_array;
2423
2424 src -= srcstride;
2425 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2426 tmp = tmp_array + 8;
2427 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2428 #else
2429 const uint8_t *filter = subpel_filters[mx - 1];
2430 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2431 int x, y;
2432 uint8_t tmp_array[152];
2433 uint8_t *tmp = tmp_array;
2434
2435 src -= srcstride;
2436
2437 for (y = 0; y < h + 3; y++) {
2438 for (x = 0; x < 8; x++)
2439 tmp[x] = FILTER_4TAP(src, filter, 1);
2440 tmp += 8;
2441 src += srcstride;
2442 }
2443
2444 tmp = tmp_array + 8;
2445 filter = subpel_filters[my - 1];
2446
2447 for (y = 0; y < h; y++) {
2448 for (x = 0; x < 8; x++)
2449 dst[x] = FILTER_4TAP(tmp, filter, 8);
2450 dst += dststride;
2451 tmp += 8;
2452 }
2453 #endif
2454 }
2455
2456 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2457 ptrdiff_t srcstride, int h, int mx, int my)
2458 {
2459 #if 1
2460 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2461 uint8_t *tmp = tmp_array;
2462
2463 src -= srcstride;
2464 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2465 tmp = tmp_array + 4;
2466 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2467 #else
2468 const uint8_t *filter = subpel_filters[mx - 1];
2469 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2470 int x, y;
2471 uint8_t tmp_array[44];
2472 uint8_t *tmp = tmp_array;
2473
2474 src -= srcstride;
2475
2476 for (y = 0; y < h + 3; y++) {
2477 for (x = 0; x < 4; x++)
2478 tmp[x] = FILTER_4TAP(src, filter, 1);
2479 tmp += 4;
2480 src += srcstride;
2481 }
2482 tmp = tmp_array + 4;
2483 filter = subpel_filters[my - 1];
2484
2485 for (y = 0; y < h; y++) {
2486 for (x = 0; x < 4; x++)
2487 dst[x] = FILTER_4TAP(tmp, filter, 4);
2488 dst += dststride;
2489 tmp += 4;
2490 }
2491 #endif
2492 }
2493
2494 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2495 ptrdiff_t srcstride, int h, int mx, int my)
2496 {
2497 #if 1
2498 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2499 uint8_t *tmp = tmp_array;
2500
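    /* As in the h4v4 case above, but the vertical 6-tap pass needs two rows
     * above and three below each output row, so the horizontal pass covers
     * h + 5 rows starting at src - 2*srcstride and tmp_array + 32 skips the
     * two extra top rows.  The h6v4 and h6v6 variants further down follow the
     * same scheme. */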
2501 src -= 2 * srcstride;
2502 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2503 tmp = tmp_array + 32;
2504 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2505 #else
2506 const uint8_t *filter = subpel_filters[mx - 1];
2507 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2508 int x, y;
2509 uint8_t tmp_array[592];
2510 uint8_t *tmp = tmp_array;
2511
2512 src -= 2 * srcstride;
2513
2514 for (y = 0; y < h + 5; y++) {
2515 for (x = 0; x < 16; x++)
2516 tmp[x] = FILTER_4TAP(src, filter, 1);
2517 tmp += 16;
2518 src += srcstride;
2519 }
2520
2521 tmp = tmp_array + 32;
2522 filter = subpel_filters[my - 1];
2523
2524 for (y = 0; y < h; y++) {
2525 for (x = 0; x < 16; x++)
2526 dst[x] = FILTER_6TAP(tmp, filter, 16);
2527 dst += dststride;
2528 tmp += 16;
2529 }
2530 #endif
2531 }
2532
2533 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2534 ptrdiff_t srcstride, int h, int mx, int my)
2535 {
2536 #if 1
2537 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2538 uint8_t *tmp = tmp_array;
2539
2540 src -= 2 * srcstride;
2541 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2542 tmp = tmp_array + 16;
2543 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2544 #else
2545 const uint8_t *filter = subpel_filters[mx - 1];
2546 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2547 int x, y;
2548 uint8_t tmp_array[168];
2549 uint8_t *tmp = tmp_array;
2550
2551 src -= 2 * srcstride;
2552
2553 for (y = 0; y < h + 5; y++) {
2554 for (x = 0; x < 8; x++)
2555 tmp[x] = FILTER_4TAP(src, filter, 1);
2556 tmp += 8;
2557 src += srcstride;
2558 }
2559
2560 tmp = tmp_array + 16;
2561 filter = subpel_filters[my - 1];
2562
2563 for (y = 0; y < h; y++) {
2564 for (x = 0; x < 8; x++)
2565 dst[x] = FILTER_6TAP(tmp, filter, 8);
2566 dst += dststride;
2567 tmp += 8;
2568 }
2569 #endif
2570 }
2571
2572 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2573 ptrdiff_t srcstride, int h, int mx, int my)
2574 {
2575 #if 1
2576 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2577 uint8_t *tmp = tmp_array;
2578
2579 src -= 2 * srcstride;
2580 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2581 tmp = tmp_array + 8;
2582 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2583 #else
2584 const uint8_t *filter = subpel_filters[mx - 1];
2585 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2586 int x, y;
2587 uint8_t tmp_array[52];
2588 uint8_t *tmp = tmp_array;
2589
2590 src -= 2 * srcstride;
2591
2592 for (y = 0; y < h + 5; y++) {
2593 for (x = 0; x < 4; x++)
2594 tmp[x] = FILTER_4TAP(src, filter, 1);
2595 tmp += 4;
2596 src += srcstride;
2597 }
2598
2599 tmp = tmp_array + 8;
2600 filter = subpel_filters[my - 1];
2601
2602 for (y = 0; y < h; y++) {
2603 for (x = 0; x < 4; x++)
2604 dst[x] = FILTER_6TAP(tmp, filter, 4);
2605 dst += dststride;
2606 tmp += 4;
2607 }
2608 #endif
2609 }
2610
2611 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2612 ptrdiff_t srcstride, int h, int mx, int my)
2613 {
2614 #if 1
2615 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2616 uint8_t *tmp = tmp_array;
2617
2618 src -= srcstride;
2619 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2620 tmp = tmp_array + 16;
2621 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2622 #else
2623 const uint8_t *filter = subpel_filters[mx - 1];
2624 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2625 int x, y;
2626 uint8_t tmp_array[560];
2627 uint8_t *tmp = tmp_array;
2628
2629 src -= srcstride;
2630
2631 for (y = 0; y < h + 3; y++) {
2632 for (x = 0; x < 16; x++)
2633 tmp[x] = FILTER_6TAP(src, filter, 1);
2634 tmp += 16;
2635 src += srcstride;
2636 }
2637
2638 tmp = tmp_array + 16;
2639 filter = subpel_filters[my - 1];
2640
2641 for (y = 0; y < h; y++) {
2642 for (x = 0; x < 16; x++)
2643 dst[x] = FILTER_4TAP(tmp, filter, 16);
2644 dst += dststride;
2645 tmp += 16;
2646 }
2647 #endif
2648 }
2649
2650 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2651 ptrdiff_t srcstride, int h, int mx, int my)
2652 {
2653 #if 1
2654 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2655 uint8_t *tmp = tmp_array;
2656
2657 src -= srcstride;
2658 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2659 tmp = tmp_array + 8;
2660 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2661 #else
2662 const uint8_t *filter = subpel_filters[mx - 1];
2663 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2664 int x, y;
2665 uint8_t tmp_array[152];
2666 uint8_t *tmp = tmp_array;
2667
2668 src -= srcstride;
2669
2670 for (y = 0; y < h + 3; y++) {
2671 for (x = 0; x < 8; x++)
2672 tmp[x] = FILTER_6TAP(src, filter, 1);
2673 tmp += 8;
2674 src += srcstride;
2675 }
2676
2677 tmp = tmp_array + 8;
2678 filter = subpel_filters[my - 1];
2679
2680 for (y = 0; y < h; y++) {
2681 for (x = 0; x < 8; x++)
2682 dst[x] = FILTER_4TAP(tmp, filter, 8);
2683 dst += dststride;
2684 tmp += 8;
2685 }
2686 #endif
2687 }
2688
2689 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2690 ptrdiff_t srcstride, int h, int mx, int my)
2691 {
2692 #if 1
2693 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2694 uint8_t *tmp = tmp_array;
2695
2696 src -= srcstride;
2697 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2698 tmp = tmp_array + 4;
2699 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2700 #else
2701 const uint8_t *filter = subpel_filters[mx - 1];
2702 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2703 int x, y;
2704 uint8_t tmp_array[44];
2705 uint8_t *tmp = tmp_array;
2706
2707 src -= srcstride;
2708
2709 for (y = 0; y < h + 3; y++) {
2710 for (x = 0; x < 4; x++)
2711 tmp[x] = FILTER_6TAP(src, filter, 1);
2712 tmp += 4;
2713 src += srcstride;
2714 }
2715
2716 tmp = tmp_array + 4;
2717 filter = subpel_filters[my - 1];
2718
2719 for (y = 0; y < h; y++) {
2720 for (x = 0; x < 4; x++)
2721 dst[x] = FILTER_4TAP(tmp, filter, 4);
2722 dst += dststride;
2723 tmp += 4;
2724 }
2725 #endif
2726 }
2727
2728 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2729 ptrdiff_t srcstride, int h, int mx, int my)
2730 {
2731 #if 1
2732 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2733 uint8_t *tmp = tmp_array;
2734
2735 src -= 2 * srcstride;
2736 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2737 tmp = tmp_array + 32;
2738 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2739 #else
2740 const uint8_t *filter = subpel_filters[mx - 1];
2741 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2742 int x, y;
2743 uint8_t tmp_array[592];
2744 uint8_t *tmp = tmp_array;
2745
2746 src -= 2 * srcstride;
2747
2748 for (y = 0; y < h + 5; y++) {
2749 for (x = 0; x < 16; x++)
2750 tmp[x] = FILTER_6TAP(src, filter, 1);
2751 tmp += 16;
2752 src += srcstride;
2753 }
2754
2755 tmp = tmp_array + 32;
2756 filter = subpel_filters[my - 1];
2757
2758 for (y = 0; y < h; y++) {
2759 for (x = 0; x < 16; x++)
2760 dst[x] = FILTER_6TAP(tmp, filter, 16);
2761 dst += dststride;
2762 tmp += 16;
2763 }
2764 #endif
2765 }
2766
2767 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2768 ptrdiff_t srcstride, int h, int mx, int my)
2769 {
2770 #if 1
2771 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2772 uint8_t *tmp = tmp_array;
2773
2774 src -= 2 * srcstride;
2775 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2776 tmp = tmp_array + 16;
2777 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2778 #else
2779 const uint8_t *filter = subpel_filters[mx - 1];
2780 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2781 int x, y;
2782 uint8_t tmp_array[168];
2783 uint8_t *tmp = tmp_array;
2784
2785 src -= 2 * srcstride;
2786
2787 for (y = 0; y < h + 5; y++) {
2788 for (x = 0; x < 8; x++)
2789 tmp[x] = FILTER_6TAP(src, filter, 1);
2790 tmp += 8;
2791 src += srcstride;
2792 }
2793
2794 tmp = tmp_array + 16;
2795 filter = subpel_filters[my - 1];
2796
2797 for (y = 0; y < h; y++) {
2798 for (x = 0; x < 8; x++)
2799 dst[x] = FILTER_6TAP(tmp, filter, 8);
2800 dst += dststride;
2801 tmp += 8;
2802 }
2803 #endif
2804 }
2805
2806 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2807 ptrdiff_t srcstride, int h, int mx, int my)
2808 {
2809 #if 1
2810 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2811 uint8_t *tmp = tmp_array;
2812
2813 src -= 2 * srcstride;
2814 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2815 tmp = tmp_array + 8;
2816 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2817 #else
2818 const uint8_t *filter = subpel_filters[mx - 1];
2819 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2820 int x, y;
2821 uint8_t tmp_array[52];
2822 uint8_t *tmp = tmp_array;
2823
2824 src -= 2 * srcstride;
2825
2826 for (y = 0; y < h + 5; y++) {
2827 for (x = 0; x < 4; x++)
2828 tmp[x] = FILTER_6TAP(src, filter, 1);
2829 tmp += 4;
2830 src += srcstride;
2831 }
2832
2833 tmp = tmp_array + 8;
2834 filter = subpel_filters[my - 1];
2835
2836 for (y = 0; y < h; y++) {
2837 for (x = 0; x < 4; x++)
2838 dst[x] = FILTER_6TAP(tmp, filter, 4);
2839 dst += dststride;
2840 tmp += 4;
2841 }
2842 #endif
2843 }
2844
2845 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2846 ptrdiff_t sstride, int h, int mx, int my)
2847 {
2848 #if 1
2849 int a = 8 - mx, b = mx;
2850 double ftmp[7];
2851 uint32_t tmp[1];
2852 mips_reg dst0, src0;
2853 DECLARE_VAR_ALL64;
2854
2855 /*
2856 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2857 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2858 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2859 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2860 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2861 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2862 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2863 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2864
2865 dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2866 dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2867 dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2868 dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2869 dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2870 dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2871 dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2872 dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2873 */
2874 __asm__ volatile (
2875 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2876 "li %[tmp0], 0x03 \n\t"
2877 "mtc1 %[tmp0], %[ftmp4] \n\t"
2878 "pshufh %[a], %[a], %[ftmp0] \n\t"
2879 "pshufh %[b], %[b], %[ftmp0] \n\t"
2880
2881 "1: \n\t"
2882 // 0 - 7
2883 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2884 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2885 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2886 // 8 - 15
2887 PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2888
2889 "addiu %[h], %[h], -0x01 \n\t"
2890 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2891 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2892 "bnez %[h], 1b \n\t"
2893 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2894 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2895 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2896 [ftmp6]"=&f"(ftmp[6]),
2897 [tmp0]"=&r"(tmp[0]),
2898 RESTRICT_ASM_ALL64
2899 [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2900 [h]"+&r"(h),
2901 [dst]"+&r"(dst), [src]"+&r"(src),
2902 [a]"+&f"(a), [b]"+&f"(b)
2903 : [sstride]"r"((mips_reg)sstride),
2904 [dstride]"r"((mips_reg)dstride),
2905 [ff_pw_4]"f"(ff_pw_4)
2906 : "memory"
2907 );
2908 #else
2909 int a = 8 - mx, b = mx;
2910 int x, y;
2911
2912 for (y = 0; y < h; y++) {
2913 for (x = 0; x < 16; x++)
2914 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2915 dst += dstride;
2916 src += sstride;
2917 }
2918 #endif
2919 }
2920
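/* As in the horizontal loop above, the 16-wide vertical loop below handles
 * each row as two 8-pixel halves: the 8-wide macro runs once on src/dst and
 * once on src + 8 / dst + 8, rather than widening the SIMD path itself.
 */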
void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

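/* The *_hv variants are composed from the h and v helpers: the horizontal
 * pass filters h + 1 rows into a temporary buffer (stride 16/8/4), and the
 * vertical pass then blends each buffer row with the row below it, as in
 * the scalar reference kept in the #else branch.  The buffers are declared
 * large enough for h + 1 rows at the largest height each block size is
 * used with (for example 8 * (16 + 1) = 136 bytes in the 8-wide case).
 */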
void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                  ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[528];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 16;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
        dst += dstride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int a = 8 - mx, b = mx;
    double ftmp[7];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a), [b]"+&f"(b)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

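/* The 4-wide variants below additionally declare the LOW32 constraint
 * (DECLARE_VAR_LOW32 / RESTRICT_ASM_LOW32); presumably this is needed by
 * the 32-bit MMI load/store helpers used inside PUT_VP8_BILINEAR4_*_MMI,
 * since only four bytes per row are written, whereas the wider variants
 * get by with the 64-bit ALL64 constraint alone.
 */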
void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int a = 8 - mx, b = mx;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a), [b]"+&f"(b)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}
