/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1 double db_1
#define DECLARE_DOUBLE_2 double db_2
#define DECLARE_UINT32_T uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)

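/* Per-byte unsigned "src1 > src2" compare. MMI's pcmpgtb is signed only,
 * so the unsigned result is derived as
 *     dst = (maxu(src1, src2) == src1) & ~(src1 == src2)
 * which yields 0xff where src1 > src2 (unsigned) and 0x00 elsewhere. */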
#define MMI_PCMPGTUB(dst, src1, src2) \
        "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
        "pmaxub %[db_2], "#src1", "#src2" \n\t" \
        "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
        "pxor "#dst", %[db_2], %[db_1] \n\t"

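/* Sign-extend the eight signed bytes of src into two vectors of four
 * halfwords ("Byte TO Halfword"): the signed compare against zero yields
 * a per-byte sign mask, which the interleaves then place in the high byte
 * of each 16-bit lane (dst_r gets the low four bytes, dst_l the high). */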
#define MMI_BTOH(dst_l, dst_r, src) \
        "pxor %[db_1], %[db_1], %[db_1] \n\t" \
        "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
        "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
        "punpckhbh "#dst_l", "#src", %[db_2] \n\t"

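/* Full (macroblock-edge) VP8 normal loop filter on eight pixels in
 * parallel, on the byte vectors p3..p0/q0..q3. It mirrors the scalar
 * reference further down in this file:
 *     hev  = max(|p1-p0|, |q1-q0|) > thresh
 *     mask = 2*|p0-q0| + |p1-q1|/2 <= E  and  all inner diffs <= I
 * then applies the mb-edge filter with the (27*w+63)>>7, (18*w+63)>>7 and
 * (9*w+63)>>7 taps of filter_mbedge(). Byte shifts are emulated through
 * PSRLB_MMI/PSRAB_MMI, which appear to unpack bytes into the high half of
 * 16-bit lanes first -- hence the shift counts 0x09 and 0x0B below
 * presumably encode 8+1 and 8+3. */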
#define MMI_VP8_LOOP_FILTER \
        /* Calculation of hev */ \
        "dmtc1 %[thresh], %[ftmp3] \n\t" \
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
        "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
        "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
        /* Calculation of mask */ \
        "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
        "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
        "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
        "li %[tmp0], 0x09 \n\t" \
        "dmtc1 %[tmp0], %[ftmp3] \n\t" \
        PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
        "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
        "dmtc1 %[e], %[ftmp3] \n\t" \
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
        "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
        "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
        "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
        "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
        "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
        "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
        "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
        "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
        "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
        "dmtc1 %[i], %[ftmp3] \n\t" \
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
        "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "pxor %[mask], %[mask], %[ftmp3] \n\t" \
        /* VP8_MBFILTER */ \
        "li %[tmp0], 0x80808080 \n\t" \
        "dmtc1 %[tmp0], %[ftmp7] \n\t" \
        "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
        "pxor %[p2], %[p2], %[ftmp7] \n\t" \
        "pxor %[p1], %[p1], %[ftmp7] \n\t" \
        "pxor %[p0], %[p0], %[ftmp7] \n\t" \
        "pxor %[q0], %[q0], %[ftmp7] \n\t" \
        "pxor %[q1], %[q1], %[ftmp7] \n\t" \
        "pxor %[q2], %[q2], %[ftmp7] \n\t" \
        "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
        "psubb %[ftmp5], %[q0], %[p0] \n\t" \
        MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
        MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
        /* Right part */ \
        "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
        "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
        "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
        /* Left part */ \
        "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
        "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
        /* Combine left and right part */ \
        "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "pand %[ftmp1], %[ftmp1], %[mask] \n\t" \
        "pand %[ftmp2], %[ftmp1], %[hev] \n\t" \
        "li %[tmp0], 0x04040404 \n\t" \
        "dmtc1 %[tmp0], %[ftmp0] \n\t" \
        "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
        "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
        "li %[tmp0], 0x0B \n\t" \
        "dmtc1 %[tmp0], %[ftmp4] \n\t" \
        PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
        "li %[tmp0], 0x03030303 \n\t" \
        "dmtc1 %[tmp0], %[ftmp0] \n\t" \
        "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
        "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
        "li %[tmp0], 0x0B \n\t" \
        "dmtc1 %[tmp0], %[ftmp2] \n\t" \
        PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
        "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
        "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
        /* filt_val &= ~hev */ \
        "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
        "pxor %[hev], %[hev], %[ftmp0] \n\t" \
        "pand %[ftmp1], %[ftmp1], %[hev] \n\t" \
        MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
        "li %[tmp0], 0x07 \n\t" \
        "dmtc1 %[tmp0], %[ftmp2] \n\t" \
        "li %[tmp0], 0x001b001b \n\t" \
        "dmtc1 %[tmp0], %[ftmp1] \n\t" \
        "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
        "li %[tmp0], 0x003f003f \n\t" \
        "dmtc1 %[tmp0], %[ftmp0] \n\t" \
        "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
        /* Right part */ \
        "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        /* Left part */ \
        "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
        "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
        "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
        /* Combine left and right part */ \
        "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
        "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
        "pxor %[q0], %[q0], %[ftmp7] \n\t" \
        "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
        "pxor %[p0], %[p0], %[ftmp7] \n\t" \
        "li %[tmp0], 0x00120012 \n\t" \
        "dmtc1 %[tmp0], %[ftmp1] \n\t" \
        "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
        /* Right part */ \
        "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        /* Left part */ \
        "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
        "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
        "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
        /* Combine left and right part */ \
        "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
        "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
        "pxor %[q1], %[q1], %[ftmp7] \n\t" \
        "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
        "pxor %[p1], %[p1], %[ftmp7] \n\t" \
        "li %[tmp0], 0x03 \n\t" \
        "dmtc1 %[tmp0], %[ftmp1] \n\t" \
        /* Right part */ \
        "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        /* Left part */ \
        "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
        "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
        "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
        "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
        /* Combine left and right part */ \
        "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
        "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
        "pxor %[q2], %[q2], %[ftmp7] \n\t" \
        "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
        "pxor %[p2], %[p2], %[ftmp7] \n\t"

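/* 4-pixel-wide, 6-tap horizontal subpel filter. Each output byte is the
 * reference FILTER_6TAP() expression (see the disabled block below):
 *     dst[x] = cm[(F[2]*src[x]   - F[1]*src[x-1] + F[0]*src[x-2] +
 *                  F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3] + 64) >> 7]
 * computed on four pixels at once in 16-bit lanes; ftmp4 holds the shift
 * count 7 and ff_pw_64 the rounding bias. */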
#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, -0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, -0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
        "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x03) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        \
        MMI_SWC1(%[ftmp1], dst, 0x00)


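/* 4-tap variant: the same scheme with the two outer taps F[0] and F[5]
 * dropped, matching FILTER_4TAP() in the reference code below. */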
#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, -0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        MMI_SWC1(%[ftmp1], dst, 0x00)


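/* Vertical counterparts: identical arithmetic, but the neighbouring
 * samples come from src +/- srcstride, with src1 used as a roving row
 * pointer. */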
#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
        \
        PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
        "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
        \
        PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        \
        MMI_SWC1(%[ftmp1], dst, 0x00)


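/* 8-pixel-wide versions: each row is unpacked into a low and a high group
 * of four 16-bit lanes (ftmp2/ftmp3), filtered separately and re-packed
 * with unsigned saturation. */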
#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, -0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, -0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
        "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x03) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, -0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
        "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x02) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
        \
        PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
        "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
        \
        PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
        "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
        \
        PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
        "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        \
        MMI_SDC1(%[ftmp1], dst, 0x00)


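/* Bilinear interpolation: dst = (a*src[x] + b*src[x+1] + 4) >> 3
 * horizontally, and likewise with c/d and a stride offset vertically,
 * where the weights are presumably the usual VP8 pairs (a = 8-mx, b = mx,
 * c = 8-my, d = my) as set up by the callers. */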
#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
        \
        MMI_ULDC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
        \
        MMI_ULWC1(%[ftmp1], src, 0x01) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
        MMI_ULDC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
        "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
        MMI_ULDC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
        "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        \
        "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
        "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
        "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
        MMI_ULWC1(%[ftmp1], src, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
        \
        PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
        MMI_ULWC1(%[ftmp1], src1, 0x00) \
        "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
        "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
        \
        "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
        "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        \
        "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
        MMI_SWC1(%[ftmp1], dst, 0x00)


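/* The 4-tap/6-tap subpel coefficients of the spec's subpel_filters[]
 * (reproduced in the disabled block below), one row per mx/my in 1..7,
 * with each 16-bit value replicated into all four lanes of a doubleword
 * so it can feed pmullh directly; e.g. 0x007b007b007b007b is tap 123. */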
DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};

#if 0
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
#endif

#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
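/* Scalar reference helpers mirroring the C versions in libavcodec/vp8dsp.c:
 * cm[] is the clamp-to-uint8 crop table, so clip_int8(n) above clamps n to
 * [-128, 127] by biasing into it. */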
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
                                                      ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
                                                         ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
                                             int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

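    /* The same tap set the MMI macro above uses: a0/a1/a2 correspond to
     * its 0x001b (27), 0x0012 (18) and shift-by-3-plus-add (9) paths. */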
    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = (9  * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
                                             int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[q0], %[dst], 0x0)
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p2], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p3], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q2], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q3], %[tmp0], 0x0)
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        MMI_USDC1(%[q0], %[dst], 0x0)
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[p2], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[q2], %[tmp0], 0x0)
        : RESTRICT_ASM_ALL64
          [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[p3], %[dst], -0x04)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[p2], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p1], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p0], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q0], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q1], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q2], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q3], %[tmp0], -0x04)
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        MMI_USDC1(%[p3], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p2], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p1], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p0], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q0], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q1], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q2], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q3], %[dst], -0x04)
        : RESTRICT_ASM_ALL64
          [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

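/* Inverse Walsh-Hadamard transform of the luma DC coefficients. The MMI
 * path vectorizes the first (vertical) butterfly pass on whole rows of
 * dc[]; the horizontal pass and the (x + 3) >> 3 rounding stay in scalar C
 * because each result is scattered to a different block's DC slot. */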
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
        "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
        "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
        "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
        "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
        "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t*)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    double ftmp[12];
    uint32_t tmp[1];
    union av_intfloat64 ff_ph_4e7b_u;
    union av_intfloat64 ff_ph_22a3_u;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
    ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;
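
    /* Fixed-point IDCT constants: 0x4e7b = 20091, so MUL_20091(a) becomes
     * pmulhh(a, 20091) + a.  0x22a3 = 8867 = 35468 / 4; 35468 does not fit
     * in a signed 16-bit lane, so MUL_35468(a) is computed as
     * pmulhh(a << 2, 8867), with the shift count 2 parked in ftmp11. */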

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li %[tmp0], 0x02 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"

        // block[0...3] + block[8...11]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // block[0...3] - block[8...11]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // MUL_35468(block[12...15])
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_35468(block[4...7])
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_20091(block[4...7])
        "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
        // MUL_20091(block[12...15])
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"

        // tmp[0 4 8 12]
        "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        // tmp[1 5 9 13]
        "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
        // tmp[2 6 10 14]
        "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
        // tmp[3 7 11 15]
        "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
        "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4 8 12]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // t[1 5 9 13]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // t[2 6 10 14]
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
        // t[3 7 11 15]
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
        "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"
        "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
        "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
        "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
        "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [block]"r"(block), [ff_pw_4]"f"(ff_pw_4.f),
          [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f), [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[dc], %[ftmp5] \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

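    /* Two rows per iteration; the first eight bytes of each row go through
     * an unaligned MMI load/store, the remaining eight through the classic
     * MIPS ldl/ldr + sdl/sdr unaligned pairs (offsets 0x0f/0x08 being the
     * high and low addresses of the second doubleword on this little-endian
     * core).  The narrower copies below use the same idiom. */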
1451 __asm__ volatile (
1452 "1: \n\t"
1453 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1454 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1455 "ldl %[tmp0], 0x0f(%[src]) \n\t"
1456 "ldr %[tmp0], 0x08(%[src]) \n\t"
1457 MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1458 "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1459 "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1460 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1461 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1462 "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1463 "sdr %[tmp0], 0x08(%[dst]) \n\t"
1464 "addiu %[h], %[h], -0x02 \n\t"
1465 MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1466 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1467 "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1468 "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1469 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1470 "bnez %[h], 1b \n\t"
1471 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1472 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1473 RESTRICT_ASM_ALL64
1474 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1475 [dst]"+&r"(dst), [src]"+&r"(src),
1476 [h]"+&r"(h)
1477 : [dststride]"r"((mips_reg)dststride),
1478 [srcstride]"r"((mips_reg)srcstride)
1479 : "memory"
1480 );
1481 #else
1482 int i;
1483
1484 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1485 memcpy(dst, src, 16);
1486 #endif
1487 }
1488
void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x07(%[addr0]) \n\t"
        "ldr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "sdl %[tmp0], 0x07(%[addr1]) \n\t"
        "sdr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}

void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "lwl %[tmp0], 0x03(%[addr0]) \n\t"
        "lwr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "swl %[tmp0], 0x03(%[addr1]) \n\t"
        "swr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 4);
#endif
}

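/* 4-tap horizontal subpel filters. mx selects the filter phase; each output
 * pixel is a rounded, >>7 combination of the four horizontally neighbouring
 * source pixels, as spelled out in the reference formulas kept in the
 * comment of each function. */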
void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    mips_reg src1, dst1;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];

    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
        PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst1]"=&r"(dst1), [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[6];
    uint32_t tmp[1];
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_LOW32;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

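/* 6-tap horizontal subpel filters: as above, but with the two outer taps
 * filter[0] and filter[5] also applied, so src[-2] through src[3] are read
 * for each output pixel. */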
void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1, dst1;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
    dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
    dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
    dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
    dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
    dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
    dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
    dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];

    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
        PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst1]"=&r"(dst1), [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[6];
    uint32_t tmp[1];
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_LOW32;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

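/* 4-tap vertical subpel filters: the same arithmetic as the horizontal
 * 4-tap versions, with the taps applied across rows (stride srcstride)
 * instead of across columns. */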
void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];

    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[6];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_LOW32;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

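/* 6-tap vertical subpel filters, reading from two rows above to three rows
 * below each output row. */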
void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];

    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[6];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_LOW32;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"

        "1: \n\t"
        PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

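/* The combined h+v subpel filters run two passes through a small stack
 * buffer: the horizontal filter first covers h + 3 rows starting one row
 * above the target area (the 4-tap vertical filter needs one row of context
 * above and two below), then the vertical filter reads back from the buffer
 * starting one row in. */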
void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 16;
    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[560];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 8;
    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[152];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 4;
    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[44];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }
    tmp = tmp_array + 4;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

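/* For h4v6 the 6-tap vertical pass needs two rows of context above and
 * three below, so the horizontal pass covers h + 5 rows starting at
 * src - 2 * srcstride and the vertical pass reads two rows into the
 * buffer. */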
void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 32;
    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[592];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp = tmp_array + 32;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 16;
    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[168];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 8;
    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[52];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

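/* h6v4: 6-tap horizontally, then 4-tap vertically; buffer sizes and
 * offsets match the h4v4 variants above. */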
void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 16;
    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[560];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 8;
    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[152];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 4;
    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[44];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp = tmp_array + 4;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

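/* h6v6: 6-tap in both directions; buffer sizes and offsets match the h4v6
 * variants above. */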
void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 32;
    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[592];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp = tmp_array + 32;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
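    /* h6v6 at width 8: h + 5 rows of 8 bytes (tmp_array is exactly
     * 8 * (16 + 5) bytes), vertical pass starting two rows in. */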
    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 16;
    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[168];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
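    /* h6v6 at width 4: h + 5 rows of 4 bytes (tmp_array is exactly
     * 4 * (8 + 5) bytes), vertical pass starting two rows in. */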
    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
    tmp = tmp_array + 8;
    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[52];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg dst0, src0;
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;

    dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
    dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
    dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
    dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
    dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
    dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
    dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
    dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
    */
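    /* pshufh with a zeroed control register broadcasts halfword 0, so the
     * weights a = 8 - mx and b = mx end up replicated across all four
     * halfwords; ftmp4 holds the shift amount 3 and ff_pw_4 supplies the
     * +4 rounding bias from the (x + 4) >> 3 above. Each 16-pixel row is
     * then produced as two 8-pixel halves. */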
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst0]"=&r"(dst0), [src0]"=&r"(src0),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    ... and likewise for dst[8] through dst[15].
    */
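    /* Same prologue as the horizontal case, with the vertical weights
     * c = 8 - my and d = my broadcast by pshufh; src1 is scratch used
     * inside the macro to address the row below the current one. */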
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                  ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
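    /* Separable bilinear: run the horizontal pass for h + 1 rows into a
     * 16-byte-stride temporary (the vertical pass reads one row below
     * each output row, as the fallback shows), then the vertical pass
     * from the temporary into dst. */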
    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[528];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 16;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
        dst += dstride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[7];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
    */
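    /* Same weight broadcast as the 16-pixel version, with a single
     * 8-pixel macro invocation per row. */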
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
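    /* Vertical weights c = 8 - my and d = my; src1 is scratch for the
     * macro's second-row pointer. */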
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
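    /* Separable bilinear at width 8: tmp_array is exactly 8 * (16 + 1)
     * bytes, i.e. h + 1 rows at the maximum h of 16. */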
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
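    /* Width-4 rows are moved with 32-bit accesses, hence the additional
     * DECLARE_VAR_LOW32/RESTRICT_ASM_LOW32 pair; the weight setup is
     * otherwise unchanged. */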
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a.f), [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
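    /* Vertical variant of the width-4 case; src1 is scratch for the
     * macro's second-row pointer. */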
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c.f), [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
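    /* Separable bilinear at width 4: tmp_array is exactly 4 * (8 + 1)
     * bytes, i.e. h + 1 rows at the maximum h of 8. */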
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}
