1 /*
2 * Loongson SIMD optimized vp8dsp
3 *
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/mips/mmiutils.h"
27
28 #define DECLARE_DOUBLE_1 double db_1
29 #define DECLARE_DOUBLE_2 double db_2
30 #define DECLARE_UINT32_T uint32_t it_1
31 #define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
32 #define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
33 #define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
34
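/* Byte-wise unsigned "greater than": MMI only has a signed pcmpgtb, so the
 * mask is built as (pmaxub(src1, src2) == src1) ^ (src1 == src2), giving
 * 0xff in every byte where src1 > src2 (unsigned). */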
35 #define MMI_PCMPGTUB(dst, src1, src2) \
36 "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
37 "pmaxub %[db_2], "#src1", "#src2" \n\t" \
38 "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
39 "xor "#dst", %[db_2], %[db_1] \n\t"
40
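/* Sign-extend the eight packed bytes of src into two vectors of four
 * halfwords: dst_r receives the low four bytes, dst_l the high four. */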
41 #define MMI_BTOH(dst_l, dst_r, src) \
42 "xor %[db_1], %[db_1], %[db_1] \n\t" \
43 "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
44 "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
45 "punpckhbh "#dst_l", "#src", %[db_2] \n\t"
46
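/* Normal (macroblock-edge) loop filter for eight pixels at a time.  The
 * edge pixels are expected in the p3..q3 registers; the macro derives the
 * hev and filter masks from the thresh/e/i limits and applies the VP8 MB
 * filter, leaving the filtered values in p2..q2. */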
47 #define MMI_VP8_LOOP_FILTER \
48 /* Calculation of hev */ \
49 "dmtc1 %[thresh], %[ftmp3] \n\t" \
50 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
51 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
52 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
53 "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
54 "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
55 "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
56 MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
57 /* Calculation of mask */ \
58 "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
59 "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
60 "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
61 "li %[tmp0], 0x09 \n\t" \
62 "dmtc1 %[tmp0], %[ftmp3] \n\t" \
63 PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
64 "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
65 "dmtc1 %[e], %[ftmp3] \n\t" \
66 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
67 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
68 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
69 MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
70 "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
71 "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
72 "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
73 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
74 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
75 "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
76 "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
77 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
78 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
79 "dmtc1 %[i], %[ftmp3] \n\t" \
80 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
81 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
82 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
83 MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
84 "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
85 "xor %[mask], %[mask], %[ftmp3] \n\t" \
86 /* VP8_MBFILTER */ \
87 "li %[tmp0], 0x80808080 \n\t" \
88 "dmtc1 %[tmp0], %[ftmp7] \n\t" \
89 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
90 "xor %[p2], %[p2], %[ftmp7] \n\t" \
91 "xor %[p1], %[p1], %[ftmp7] \n\t" \
92 "xor %[p0], %[p0], %[ftmp7] \n\t" \
93 "xor %[q0], %[q0], %[ftmp7] \n\t" \
94 "xor %[q1], %[q1], %[ftmp7] \n\t" \
95 "xor %[q2], %[q2], %[ftmp7] \n\t" \
96 "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
97 "psubb %[ftmp5], %[q0], %[p0] \n\t" \
98 MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
99 MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
100 /* Right part */ \
101 "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
102 "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
103 "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
104 /* Left part */ \
105 "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
106 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
107 "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
108 /* Combine left and right part */ \
109 "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
110 "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
111 "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
112 "li %[tmp0], 0x04040404 \n\t" \
113 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
114 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
115 "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
116 "li %[tmp0], 0x0B \n\t" \
117 "dmtc1 %[tmp0], %[ftmp4] \n\t" \
118 PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
119 "li %[tmp0], 0x03030303 \n\t" \
120 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
121 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
122 "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
123 "li %[tmp0], 0x0B \n\t" \
124 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
125 PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
126 "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
127 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
128 /* filt_val &= ~hev */ \
129 "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
130 "xor %[hev], %[hev], %[ftmp0] \n\t" \
131 "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
132 MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
133 "li %[tmp0], 0x07 \n\t" \
134 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
135 "li %[tmp0], 0x001b001b \n\t" \
136 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
137 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
138 "li %[tmp0], 0x003f003f \n\t" \
139 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
140 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
141 /* Right part */ \
142 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
143 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
144 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
145 /* Left part */ \
146 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
147 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
148 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
149 /* Combine left and right part */ \
150 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
151 "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
152 "xor %[q0], %[q0], %[ftmp7] \n\t" \
153 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
154 "xor %[p0], %[p0], %[ftmp7] \n\t" \
155 "li %[tmp0], 0x00120012 \n\t" \
156 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
157 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
158 /* Right part */ \
159 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
160 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
161 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
162 /* Left part */ \
163 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
164 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
165 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
166 /* Combine left and right part */ \
167 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
168 "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
169 "xor %[q1], %[q1], %[ftmp7] \n\t" \
170 "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
171 "xor %[p1], %[p1], %[ftmp7] \n\t" \
172 "li %[tmp0], 0x03 \n\t" \
173 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
174 /* Right part */ \
175 "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
176 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
177 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
178 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
179 /* Left part */ \
180 "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
181 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
182 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
183 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
184 /* Combine left and right part */ \
185 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
186 "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
187 "xor %[q2], %[q2], %[ftmp7] \n\t" \
188 "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
189 "xor %[p2], %[p2], %[ftmp7] \n\t"
190
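/* Horizontal 6-tap subpel filter for one row of four pixels: the six taps
 * of src[-2..3] are accumulated in 16-bit lanes (the taps at -1 and +2 are
 * subtracted since the table stores magnitudes), then rounded with
 * ff_pw_64, shifted right by the amount in ftmp4 (7) and packed to bytes. */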
191 #define PUT_VP8_EPEL4_H6_MMI(src, dst) \
192 MMI_ULWC1(%[ftmp1], src, 0x00) \
193 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
194 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
195 \
196 MMI_ULWC1(%[ftmp1], src, -0x01) \
197 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
198 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
199 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
200 \
201 MMI_ULWC1(%[ftmp1], src, -0x02) \
202 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
203 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
204 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
205 \
206 MMI_ULWC1(%[ftmp1], src, 0x01) \
207 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
208 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
209 \
210 MMI_ULWC1(%[ftmp1], src, 0x02) \
211 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
212 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
213 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
214 \
215 MMI_ULWC1(%[ftmp1], src, 0x03) \
216 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
217 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
218 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
219 \
220 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
221 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
222 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
223 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
224 \
225 MMI_SWC1(%[ftmp1], dst, 0x00)
226
227
228 #define PUT_VP8_EPEL4_H4_MMI(src, dst) \
229 MMI_ULWC1(%[ftmp1], src, 0x00) \
230 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
231 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
232 \
233 MMI_ULWC1(%[ftmp1], src, -0x01) \
234 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
235 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
236 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
237 \
238 MMI_ULWC1(%[ftmp1], src, 0x01) \
239 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
240 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
241 \
242 MMI_ULWC1(%[ftmp1], src, 0x02) \
243 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
244 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
245 "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
246 \
247 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
248 \
249 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
250 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
251 \
252 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
253 MMI_SWC1(%[ftmp1], dst, 0x00)
254
255
256 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
257 MMI_ULWC1(%[ftmp1], src, 0x00) \
258 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
259 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
260 \
261 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
262 MMI_ULWC1(%[ftmp1], src1, 0x00) \
263 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
264 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
265 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
266 \
267 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
268 MMI_ULWC1(%[ftmp1], src1, 0x00) \
269 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
270 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
271 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
272 \
273 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
274 MMI_ULWC1(%[ftmp1], src1, 0x00) \
275 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
276 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
277 \
278 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
279 MMI_ULWC1(%[ftmp1], src1, 0x00) \
280 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
281 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
282 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
283 \
284 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
285 MMI_ULWC1(%[ftmp1], src1, 0x00) \
286 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
287 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
288 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
289 \
290 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
291 \
292 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
293 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
294 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
295 \
296 MMI_SWC1(%[ftmp1], dst, 0x00)
297
298
299 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
300 MMI_ULWC1(%[ftmp1], src, 0x00) \
301 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
302 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
303 \
304 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
305 MMI_ULWC1(%[ftmp1], src1, 0x00) \
306 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
307 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
308 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
309 \
310 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
311 MMI_ULWC1(%[ftmp1], src1, 0x00) \
312 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
313 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
314 \
315 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
316 MMI_ULWC1(%[ftmp1], src1, 0x00) \
317 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
318 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
319 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
320 \
321 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
322 \
323 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
324 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
325 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
326 \
327 MMI_SWC1(%[ftmp1], dst, 0x00)
328
329
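/* Eight-pixel-wide variants of the EPEL filters: each row is unpacked into
 * a low and a high group of four halfwords and both halves are filtered in
 * parallel before being packed back into a single 8-byte store. */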
330 #define PUT_VP8_EPEL8_H6_MMI(src, dst) \
331 MMI_ULDC1(%[ftmp1], src, 0x00) \
332 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
333 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
334 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
335 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
336 \
337 MMI_ULDC1(%[ftmp1], src, -0x01) \
338 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
339 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
340 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
341 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
342 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
343 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
344 \
345 MMI_ULDC1(%[ftmp1], src, -0x02) \
346 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
347 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
348 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
349 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
350 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
351 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
352 \
353 MMI_ULDC1(%[ftmp1], src, 0x01) \
354 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
355 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
356 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
357 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
358 \
359 MMI_ULDC1(%[ftmp1], src, 0x02) \
360 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
361 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
362 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
363 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
364 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
365 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
366 \
367 MMI_ULDC1(%[ftmp1], src, 0x03) \
368 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
369 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
370 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
371 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
372 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
373 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
374 \
375 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
376 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
377 \
378 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
379 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
380 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
381 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
382 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
383 \
384 MMI_SDC1(%[ftmp1], dst, 0x00)
385
386
387 #define PUT_VP8_EPEL8_H4_MMI(src, dst) \
388 MMI_ULDC1(%[ftmp1], src, 0x00) \
389 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
390 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
391 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
392 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
393 \
394 MMI_ULDC1(%[ftmp1], src, -0x01) \
395 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
396 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
397 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
398 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
399 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
400 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
401 \
402 MMI_ULDC1(%[ftmp1], src, 0x01) \
403 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
404 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
405 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
406 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
407 \
408 MMI_ULDC1(%[ftmp1], src, 0x02) \
409 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
410 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
411 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
412 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
413 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
414 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
415 \
416 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
417 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
418 \
419 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
420 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
421 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
422 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
423 \
424 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
425 MMI_SDC1(%[ftmp1], dst, 0x00)
426
427
428 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
429 MMI_ULDC1(%[ftmp1], src, 0x00) \
430 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
431 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
432 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
433 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
434 \
435 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
436 MMI_ULDC1(%[ftmp1], src1, 0x00) \
437 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
438 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
439 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
440 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
441 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
442 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
443 \
444 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
445 MMI_ULDC1(%[ftmp1], src1, 0x00) \
446 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
447 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
448 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
449 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
450 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
451 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
452 \
453 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
454 MMI_ULDC1(%[ftmp1], src1, 0x00) \
455 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
456 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
457 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
458 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
459 \
460 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
461 MMI_ULDC1(%[ftmp1], src1, 0x00) \
462 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
463 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
464 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
465 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
466 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
467 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
468 \
469 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
470 MMI_ULDC1(%[ftmp1], src1, 0x00) \
471 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
472 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
473 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
474 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
475 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
476 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
477 \
478 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
479 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
480 \
481 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
482 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
483 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
484 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
485 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
486 \
487 MMI_SDC1(%[ftmp1], dst, 0x00)
488
489
490 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
491 MMI_ULDC1(%[ftmp1], src, 0x00) \
492 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
493 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
494 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
495 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
496 \
497 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
498 MMI_ULDC1(%[ftmp1], src1, 0x00) \
499 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
500 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
501 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
502 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
503 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
504 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
505 \
506 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
507 MMI_ULDC1(%[ftmp1], src1, 0x00) \
508 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
509 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
510 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
511 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
512 \
513 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
514 MMI_ULDC1(%[ftmp1], src1, 0x00) \
515 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
516 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
517 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
518 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
519 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
520 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
521 \
522 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
523 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
524 \
525 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
526 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
527 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
528 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
529 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
530 \
531 MMI_SDC1(%[ftmp1], dst, 0x00)
532
533
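/* Bilinear interpolation: out = (a * src[0] + b * src[1] + 4) >> 3
 * horizontally and (c * src[0] + d * src[stride] + 4) >> 3 vertically,
 * with the shift amount (3) preloaded into ftmp4 by the caller. */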
534 #define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
535 MMI_ULDC1(%[ftmp1], src, 0x00) \
536 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
537 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
538 "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
539 "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
540 \
541 MMI_ULDC1(%[ftmp1], src, 0x01) \
542 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
543 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
544 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
545 "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
546 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
547 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
548 \
549 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
550 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
551 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
552 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
553 \
554 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
555 MMI_SDC1(%[ftmp1], dst, 0x00)
556
557
558 #define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
559 MMI_ULWC1(%[ftmp1], src, 0x00) \
560 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
561 "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
562 \
563 MMI_ULWC1(%[ftmp1], src, 0x01) \
564 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
565 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
566 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
567 \
568 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
569 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
570 \
571 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
572 MMI_SWC1(%[ftmp1], dst, 0x00)
573
574
575 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
576 MMI_ULDC1(%[ftmp1], src, 0x00) \
577 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
578 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
579 "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
580 "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
581 \
582 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
583 MMI_ULDC1(%[ftmp1], src1, 0x00) \
584 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
585 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
586 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
587 "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
588 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
589 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
590 \
591 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
592 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
593 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
594 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
595 \
596 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
597 MMI_SDC1(%[ftmp1], dst, 0x00)
598
599
600 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
601 MMI_ULWC1(%[ftmp1], src, 0x00) \
602 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
603 "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
604 \
605 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
606 MMI_ULWC1(%[ftmp1], src1, 0x00) \
607 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
608 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
609 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
610 \
611 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
612 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
613 \
614 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
615 MMI_SWC1(%[ftmp1], dst, 0x00)
616
617
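/* VP8 subpel filter coefficients (cf. subpel_filters[] below), each tap
 * replicated into the four 16-bit lanes of a 64-bit word so it can be fed
 * straight to pmullh. */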
618 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
619 {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
620 0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
621
622 {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
623 0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
624
625 {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
626 0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
627
628 {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
629 0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
630
631 {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
632 0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
633
634 {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
635 0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
636
637 {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
638 0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
639 };
640
641 #if 0
642 #define FILTER_6TAP(src, F, stride) \
643 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
644 F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
645 F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
646
647 #define FILTER_4TAP(src, F, stride) \
648 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
649 F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
650
651 static const uint8_t subpel_filters[7][6] = {
652 { 0, 6, 123, 12, 1, 0 },
653 { 2, 11, 108, 36, 8, 1 },
654 { 0, 9, 93, 50, 6, 0 },
655 { 3, 16, 77, 77, 16, 3 },
656 { 0, 6, 50, 93, 9, 0 },
657 { 1, 8, 36, 108, 11, 2 },
658 { 0, 1, 12, 123, 6, 0 },
659 };
660
661 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
662 #define MUL_35468(a) (((a) * 35468) >> 16)
663 #endif
664
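/* Scalar helpers mirroring the C reference code in vp8dsp.c; the inner and
 * simple loop filters below use these rather than MMI. */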
665 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
667 ptrdiff_t stride)
668 {
669 int av_unused p1 = p[-2 * stride];
670 int av_unused p0 = p[-1 * stride];
671 int av_unused q0 = p[ 0 * stride];
672 int av_unused q1 = p[ 1 * stride];
673 int a, f1, f2;
674 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
675
676 a = 3 * (q0 - p0);
677 a += clip_int8(p1 - q1);
678 a = clip_int8(a);
679
680 // We deviate from the spec here with c(a+3) >> 3
681 // since that's what libvpx does.
682 f1 = FFMIN(a + 4, 127) >> 3;
683 f2 = FFMIN(a + 3, 127) >> 3;
684
685 // Despite what the spec says, we do need to clamp here to
686 // be bitexact with libvpx.
687 p[-1 * stride] = cm[p0 + f2];
688 p[ 0 * stride] = cm[q0 - f1];
689 }
690
static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
692 ptrdiff_t stride)
693 {
694 int av_unused p1 = p[-2 * stride];
695 int av_unused p0 = p[-1 * stride];
696 int av_unused q0 = p[ 0 * stride];
697 int av_unused q1 = p[ 1 * stride];
698 int a, f1, f2;
699 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
700
701 a = 3 * (q0 - p0);
702 a = clip_int8(a);
703
704 // We deviate from the spec here with c(a+3) >> 3
705 // since that's what libvpx does.
706 f1 = FFMIN(a + 4, 127) >> 3;
707 f2 = FFMIN(a + 3, 127) >> 3;
708
709 // Despite what the spec says, we do need to clamp here to
710 // be bitexact with libvpx.
711 p[-1 * stride] = cm[p0 + f2];
712 p[ 0 * stride] = cm[q0 - f1];
713 a = (f1 + 1) >> 1;
714 p[-2 * stride] = cm[p1 + a];
715 p[ 1 * stride] = cm[q1 - a];
716 }
717
static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
719 int flim)
720 {
721 int av_unused p1 = p[-2 * stride];
722 int av_unused p0 = p[-1 * stride];
723 int av_unused q0 = p[ 0 * stride];
724 int av_unused q1 = p[ 1 * stride];
725
726 return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
727 }
728
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
730 {
731 int av_unused p1 = p[-2 * stride];
732 int av_unused p0 = p[-1 * stride];
733 int av_unused q0 = p[ 0 * stride];
734 int av_unused q1 = p[ 1 * stride];
735
736 return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
737 }
738
static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
740 {
741 int a0, a1, a2, w;
742 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
743
744 int av_unused p2 = p[-3 * stride];
745 int av_unused p1 = p[-2 * stride];
746 int av_unused p0 = p[-1 * stride];
747 int av_unused q0 = p[ 0 * stride];
748 int av_unused q1 = p[ 1 * stride];
749 int av_unused q2 = p[ 2 * stride];
750
751 w = clip_int8(p1 - q1);
752 w = clip_int8(w + 3 * (q0 - p0));
753
754 a0 = (27 * w + 63) >> 7;
755 a1 = (18 * w + 63) >> 7;
756 a2 = (9 * w + 63) >> 7;
757
758 p[-3 * stride] = cm[p2 + a2];
759 p[-2 * stride] = cm[p1 + a1];
760 p[-1 * stride] = cm[p0 + a0];
761 p[ 0 * stride] = cm[q0 - a0];
762 p[ 1 * stride] = cm[q1 - a1];
763 p[ 2 * stride] = cm[q2 - a2];
764 }
765
static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
767 int E, int I)
768 {
769 int av_unused p3 = p[-4 * stride];
770 int av_unused p2 = p[-3 * stride];
771 int av_unused p1 = p[-2 * stride];
772 int av_unused p0 = p[-1 * stride];
773 int av_unused q0 = p[ 0 * stride];
774 int av_unused q1 = p[ 1 * stride];
775 int av_unused q2 = p[ 2 * stride];
776 int av_unused q3 = p[ 3 * stride];
777
778 return vp8_simple_limit(p, stride, E) &&
779 FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
780 FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
781 FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
782 }
783
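/* Filter a horizontal (macroblock) edge: load 8 bytes from each of the four
 * rows above and below dst, run MMI_VP8_LOOP_FILTER and store the six
 * modified rows (p2..q2) back. */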
static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
785 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
786 {
787 double ftmp[18];
788 uint32_t tmp[1];
789 DECLARE_DOUBLE_1;
790 DECLARE_DOUBLE_2;
791 DECLARE_UINT32_T;
792 __asm__ volatile(
793 /* Get data from dst */
794 "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
795 "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
796 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
797 "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
798 "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
799 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
800 "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
801 "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
802 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
803 "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
804 "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
805 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
806 "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
807 "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
808 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
809 "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
810 "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
811 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
812 "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
813 "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
814 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
815 "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
816 "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
817 MMI_VP8_LOOP_FILTER
818 /* Move to dst */
819 "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
820 "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
821 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
822 "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
823 "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
824 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
825 "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
826 "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
827 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
828 "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
829 "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
830 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
831 "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
832 "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
833 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
834 "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
835 "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
836 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
837 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
838 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
839 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
840 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
841 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
842 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
843 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
844 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
845 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
846 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
847 RESTRICT_ASM_UINT32_T
848 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
849 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
850 : "memory"
851 );
852 }
853
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
855 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
856 {
857 int i;
858
859 for (i = 0; i < 8; i++)
860 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
861 int hv = hev(dst + i * 1, stride, hev_thresh);
862 if (hv)
863 vp8_filter_common_is4tap(dst + i * 1, stride);
864 else
865 vp8_filter_common_isnot4tap(dst + i * 1, stride);
866 }
867 }
868
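/* Filter a vertical (macroblock) edge: gather 8 bytes around the edge for
 * eight consecutive rows, transpose so each register holds one pixel
 * column, filter, then transpose back and store. */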
static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
870 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
871 {
872 double ftmp[18];
873 uint32_t tmp[1];
874 DECLARE_DOUBLE_1;
875 DECLARE_DOUBLE_2;
876 DECLARE_UINT32_T;
877 __asm__ volatile(
878 /* Get data from dst */
879 "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
880 "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
881 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
882 "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
883 "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
884 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
885 "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
886 "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
887 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
888 "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
889 "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
890 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
891 "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
892 "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
893 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
894 "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
895 "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
896 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
897 "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
898 "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
899 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
900 "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
901 "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
902 /* Matrix transpose */
903 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
904 %[q0], %[q1], %[q2], %[q3],
905 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
906 MMI_VP8_LOOP_FILTER
907 /* Matrix transpose */
908 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
909 %[q0], %[q1], %[q2], %[q3],
910 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
911 /* Move to dst */
912 "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
913 "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
914 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
915 "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
916 "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
917 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
918 "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
919 "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
920 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
921 "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
922 "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
923 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
924 "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
925 "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
926 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
927 "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
928 "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
929 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
930 "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
931 "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
932 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
933 "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
934 "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
935 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
936 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
937 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
938 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
939 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
940 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
941 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
942 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
943 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
944 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
945 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
946 RESTRICT_ASM_UINT32_T
947 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
948 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
949 : "memory"
950 );
951 }
952
static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
954 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
955 {
956 int i;
957
958 for (i = 0; i < 8; i++)
959 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
960 int hv = hev(dst + i * stride, 1, hev_thresh);
961 if (hv)
962 vp8_filter_common_is4tap(dst + i * stride, 1);
963 else
964 vp8_filter_common_isnot4tap(dst + i * stride, 1);
965 }
966 }
967
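/* Inverse Walsh-Hadamard transform of the luma DC coefficients: the MMI
 * block performs the vertical pass on dc[], the scalar code below does the
 * horizontal pass with (x + 3) >> 3 rounding and scatters the results into
 * the DC position of each of the 16 4x4 blocks. */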
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
969 {
970 #if 1
971 double ftmp[8];
972 DECLARE_VAR_ALL64;
973
974 __asm__ volatile (
975 MMI_LDC1(%[ftmp0], %[dc], 0x00)
976 MMI_LDC1(%[ftmp1], %[dc], 0x08)
977 MMI_LDC1(%[ftmp2], %[dc], 0x10)
978 MMI_LDC1(%[ftmp3], %[dc], 0x18)
979 "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
980 "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
981 "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
982 "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
983 "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
984 "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
985 "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
986 "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
987 MMI_SDC1(%[ftmp0], %[dc], 0x00)
988 MMI_SDC1(%[ftmp1], %[dc], 0x08)
989 MMI_SDC1(%[ftmp2], %[dc], 0x10)
990 MMI_SDC1(%[ftmp3], %[dc], 0x18)
991 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
992 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
993 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
994 [ftmp6]"=&f"(ftmp[6]),
995 RESTRICT_ASM_ALL64
996 [ftmp7]"=&f"(ftmp[7])
997 : [dc]"r"((uint8_t*)dc)
998 : "memory"
999 );
1000
1001 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1002 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1003 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1004 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1005
1006 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1007 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1008 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1009 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1010
1011 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1012 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1013 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1014 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1015
1016 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1017 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1018 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1019 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1020
1021 __asm__ volatile (
1022 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1023 MMI_SDC1(%[ftmp0], %[dc], 0x00)
1024 MMI_SDC1(%[ftmp0], %[dc], 0x08)
1025 MMI_SDC1(%[ftmp0], %[dc], 0x10)
1026 MMI_SDC1(%[ftmp0], %[dc], 0x18)
1027 : RESTRICT_ASM_ALL64
1028 [ftmp0]"=&f"(ftmp[0])
1029 : [dc]"r"((uint8_t *)dc)
1030 : "memory"
1031 );
1032 #else
1033 int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1034
1035 t00 = dc[0] + dc[12];
1036 t10 = dc[1] + dc[13];
1037 t20 = dc[2] + dc[14];
1038 t30 = dc[3] + dc[15];
1039
1040 t03 = dc[0] - dc[12];
1041 t13 = dc[1] - dc[13];
1042 t23 = dc[2] - dc[14];
1043 t33 = dc[3] - dc[15];
1044
1045 t01 = dc[4] + dc[ 8];
1046 t11 = dc[5] + dc[ 9];
1047 t21 = dc[6] + dc[10];
1048 t31 = dc[7] + dc[11];
1049
1050 t02 = dc[4] - dc[ 8];
1051 t12 = dc[5] - dc[ 9];
1052 t22 = dc[6] - dc[10];
1053 t32 = dc[7] - dc[11];
1054
1055 dc[ 0] = t00 + t01;
1056 dc[ 1] = t10 + t11;
1057 dc[ 2] = t20 + t21;
1058 dc[ 3] = t30 + t31;
1059
1060 dc[ 4] = t03 + t02;
1061 dc[ 5] = t13 + t12;
1062 dc[ 6] = t23 + t22;
1063 dc[ 7] = t33 + t32;
1064
1065 dc[ 8] = t00 - t01;
1066 dc[ 9] = t10 - t11;
1067 dc[10] = t20 - t21;
1068 dc[11] = t30 - t31;
1069
1070 dc[12] = t03 - t02;
1071 dc[13] = t13 - t12;
1072 dc[14] = t23 - t22;
1073 dc[15] = t33 - t32;
1074
1075 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1076 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1077 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1078 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1079
1080 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1081 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1082 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1083 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1084
1085 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1086 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1087 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1088 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1089
1090 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1091 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1092 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1093 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1094
1095 AV_ZERO64(dc + 0);
1096 AV_ZERO64(dc + 4);
1097 AV_ZERO64(dc + 8);
1098 AV_ZERO64(dc + 12);
1099 #endif
1100 }
1101
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1103 {
1104 int val = (dc[0] + 3) >> 3;
1105
1106 dc[0] = 0;
1107
1108 block[0][0][0] = val;
1109 block[0][1][0] = val;
1110 block[0][2][0] = val;
1111 block[0][3][0] = val;
1112 block[1][0][0] = val;
1113 block[1][1][0] = val;
1114 block[1][2][0] = val;
1115 block[1][3][0] = val;
1116 block[2][0][0] = val;
1117 block[2][1][0] = val;
1118 block[2][2][0] = val;
1119 block[2][3][0] = val;
1120 block[3][0][0] = val;
1121 block[3][1][0] = val;
1122 block[3][2][0] = val;
1123 block[3][3][0] = val;
1124 }
1125
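/* 4x4 inverse DCT and add to dst.  The constants 0x4e7b (20091) and 0x22a3
 * (8867 = 35468 / 4) implement MUL_20091() / MUL_35468() via pmulhh; the
 * missing factor of 4 is recovered by the preceding psllh by 2. */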
void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1127 {
1128 #if 1
1129 DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
1130 DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
1131 double ftmp[12];
1132 uint32_t tmp[1];
1133 DECLARE_VAR_LOW32;
1134 DECLARE_VAR_ALL64;
1135
1136 __asm__ volatile (
1137 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1138 MMI_LDC1(%[ftmp1], %[block], 0x00)
1139 MMI_LDC1(%[ftmp2], %[block], 0x08)
1140 MMI_LDC1(%[ftmp3], %[block], 0x10)
1141 MMI_LDC1(%[ftmp4], %[block], 0x18)
1142
1143 "li %[tmp0], 0x02 \n\t"
1144 "mtc1 %[tmp0], %[ftmp11] \n\t"
1145
1146 // block[0...3] + block[8...11]
1147 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1148 // block[0...3] - block[8...11]
1149 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1150 // MUL_35468(block[12...15])
1151 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1152 "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
1153 // MUL_35468(block[4...7])
1154 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1155 "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
1156 // MUL_20091(block[4...7]
1157 "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
1158 "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
1159 // MUL_20091(block[12...15])
1160 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1161 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
1162
1163 // tmp[0 4 8 12]
1164 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
1165 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
1166 // tmp[1 5 9 13]
1167 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
1168 "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
1169 // tmp[2 6 10 14]
1170 "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
1171 "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
1172 // tmp[3 7 11 15]
1173 "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
1174 "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1175
1176 MMI_SDC1(%[ftmp0], %[block], 0x00)
1177 MMI_SDC1(%[ftmp0], %[block], 0x08)
1178 MMI_SDC1(%[ftmp0], %[block], 0x10)
1179 MMI_SDC1(%[ftmp0], %[block], 0x18)
1180
1181 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1182 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1183
1184 // t[0 4 8 12]
1185 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1186 // t[1 5 9 13]
1187 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1188 // t[2 6 10 14]
1189 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1190 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1191 "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
1192 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1193 "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1194 // t[3 7 11 15]
1195 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1196 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1197 "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
1198 "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
1199 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1200
1201 "li %[tmp0], 0x03 \n\t"
1202 "mtc1 %[tmp0], %[ftmp11] \n\t"
1203 "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
1204 "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
1205 "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1206 "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
1207 "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
1208 "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
1209 "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
1210 "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
1211 "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
1212 "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
1213 "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
1214 "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
1215
1216 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1217 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1218
1219 MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1220 MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1221 MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1222 MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1223
1224 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1225 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1226 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1227 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1228
1229 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1230 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1231 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1232 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1233
1234 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1235 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1236 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1237 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1238
1239 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1240 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1241 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1242 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1243 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1244 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1245 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1246 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1247 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1248 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1249 RESTRICT_ASM_LOW32
1250 RESTRICT_ASM_ALL64
1251 [tmp0]"=&r"(tmp[0])
1252 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1253 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1254 [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
1255 [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
1256 : "memory"
1257 );
1258 #else
1259 int i, t0, t1, t2, t3;
1260 int16_t tmp[16];
1261
1262 for (i = 0; i < 4; i++) {
1263 t0 = block[0 + i] + block[8 + i];
1264 t1 = block[0 + i] - block[8 + i];
1265 t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1266 t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1267 block[ 0 + i] = 0;
1268 block[ 4 + i] = 0;
1269 block[ 8 + i] = 0;
1270 block[12 + i] = 0;
1271
1272 tmp[i * 4 + 0] = t0 + t3;
1273 tmp[i * 4 + 1] = t1 + t2;
1274 tmp[i * 4 + 2] = t1 - t2;
1275 tmp[i * 4 + 3] = t0 - t3;
1276 }
1277
1278 for (i = 0; i < 4; i++) {
1279 t0 = tmp[0 + i] + tmp[8 + i];
1280 t1 = tmp[0 + i] - tmp[8 + i];
1281 t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1282 t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1283
1284 dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1285 dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1286 dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1287 dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1288 dst += stride;
1289 }
1290 #endif
1291 }
1292
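/* DC-only inverse transform: broadcast (block[0] + 4) >> 3 to all lanes
 * with pshufh, add it to the 4x4 block and clamp with packushb. */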
void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1294 {
1295 #if 1
1296 int dc = (block[0] + 4) >> 3;
1297 double ftmp[6];
1298 DECLARE_VAR_LOW32;
1299
1300 block[0] = 0;
1301
1302 __asm__ volatile (
1303 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1304 "mtc1 %[dc], %[ftmp5] \n\t"
1305 MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1306 MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1307 MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1308 MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1309 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1310 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1311 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1312 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1313 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1314 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1315 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1316 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1317 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1318 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1319 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1320 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1321 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1322 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1323 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1324 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1325 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1326 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1327 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1328 [ftmp4]"=&f"(ftmp[4]),
1329 RESTRICT_ASM_LOW32
1330 [ftmp5]"=&f"(ftmp[5])
1331 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1332 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1333 [dc]"r"(dc)
1334 : "memory"
1335 );
1336 #else
1337 int i, dc = (block[0] + 4) >> 3;
1338
1339 block[0] = 0;
1340
1341 for (i = 0; i < 4; i++) {
1342 dst[0] = av_clip_uint8(dst[0] + dc);
1343 dst[1] = av_clip_uint8(dst[1] + dc);
1344 dst[2] = av_clip_uint8(dst[2] + dc);
1345 dst[3] = av_clip_uint8(dst[3] + dc);
1346 dst += stride;
1347 }
1348 #endif
1349 }
1350
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1352 ptrdiff_t stride)
1353 {
1354 ff_vp8_idct_dc_add_mmi(dst + 0, block[0], stride);
1355 ff_vp8_idct_dc_add_mmi(dst + 4, block[1], stride);
1356 ff_vp8_idct_dc_add_mmi(dst + 8, block[2], stride);
1357 ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1358 }
1359
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1361 ptrdiff_t stride)
1362 {
1363 ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1364 ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1365 ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1366 ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1367 }
1368
1369 // loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1371 int flim_I, int hev_thresh)
1372 {
1373 vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1374 vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1375 }
1376
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1378 int flim_I, int hev_thresh)
1379 {
1380 vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1381 vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1382 hev_thresh);
1383 }
1384
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1386 int flim_E, int flim_I, int hev_thresh)
1387 {
1388 vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1389 vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1390 }
1391
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1393 int flim_E, int flim_I, int hev_thresh)
1394 {
1395 vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1396 vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1397 }
1398
1399 // loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1401 int flim_E, int flim_I, int hev_thresh)
1402 {
1403 int i;
1404
1405 for (i = 0; i < 16; i++)
1406 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1407 int hv = hev(dst + i * 1, stride, hev_thresh);
1408 if (hv)
1409 vp8_filter_common_is4tap(dst + i * 1, stride);
1410 else
1411 vp8_filter_common_isnot4tap(dst + i * 1, stride);
1412 }
1413 }
1414
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1416 int flim_E, int flim_I, int hev_thresh)
1417 {
1418 int i;
1419
1420 for (i = 0; i < 16; i++)
1421 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1422 int hv = hev(dst + i * stride, 1, hev_thresh);
1423 if (hv)
1424 vp8_filter_common_is4tap(dst + i * stride, 1);
1425 else
1426 vp8_filter_common_isnot4tap(dst + i * stride, 1);
1427 }
1428 }
1429
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1431 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1432 {
1433 vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1434 vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1435 }
1436
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1438 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1439 {
1440 vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1441 vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1442 }
1443
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1445 {
1446 int i;
1447
1448 for (i = 0; i < 16; i++)
1449 if (vp8_simple_limit(dst + i, stride, flim))
1450 vp8_filter_common_is4tap(dst + i, stride);
1451 }
1452
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1454 {
1455 int i;
1456
1457 for (i = 0; i < 16; i++)
1458 if (vp8_simple_limit(dst + i * stride, 1, flim))
1459 vp8_filter_common_is4tap(dst + i * stride, 1);
1460 }
1461
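/* Plain (full-pel) copy of a 16-pixel-wide block, two rows per iteration,
 * using ldl/ldr and sdl/sdr pairs for the possibly unaligned halves; the
 * x/y subpel arguments are unused.  The 8- and 4-pixel variants below work
 * similarly. */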
void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1463 ptrdiff_t srcstride, int h, int x, int y)
1464 {
1465 #if 1
1466 double ftmp[2];
1467 uint64_t tmp[2];
1468 mips_reg addr[2];
1469 DECLARE_VAR_ALL64;
1470
1471 __asm__ volatile (
1472 "1: \n\t"
1473 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1474 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1475 "ldl %[tmp0], 0x0f(%[src]) \n\t"
1476 "ldr %[tmp0], 0x08(%[src]) \n\t"
1477 MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1478 "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1479 "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1480 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1481 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1482 "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1483 "sdr %[tmp0], 0x08(%[dst]) \n\t"
1484 "addiu %[h], %[h], -0x02 \n\t"
1485 MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1486 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1487 "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1488 "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1489 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1490 "bnez %[h], 1b \n\t"
1491 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1492 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1493 RESTRICT_ASM_ALL64
1494 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1495 [dst]"+&r"(dst), [src]"+&r"(src),
1496 [h]"+&r"(h)
1497 : [dststride]"r"((mips_reg)dststride),
1498 [srcstride]"r"((mips_reg)srcstride)
1499 : "memory"
1500 );
1501 #else
1502 int i;
1503
1504 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1505 memcpy(dst, src, 16);
1506 #endif
1507 }
1508
void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int h, int x, int y)
1511 {
1512 #if 1
1513 double ftmp[1];
1514 uint64_t tmp[1];
1515 mips_reg addr[2];
1516 DECLARE_VAR_ALL64;
1517
1518 __asm__ volatile (
1519 "1: \n\t"
1520 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1521 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1522 "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1523 "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1524 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1525 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1526 "addiu %[h], %[h], -0x02 \n\t"
1527 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1528 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1529 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1530 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1531 "bnez %[h], 1b \n\t"
1532 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1533 RESTRICT_ASM_ALL64
1534 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1535 [dst]"+&r"(dst), [src]"+&r"(src),
1536 [h]"+&r"(h)
1537 : [dststride]"r"((mips_reg)dststride),
1538 [srcstride]"r"((mips_reg)srcstride)
1539 : "memory"
1540 );
1541 #else
1542 int i;
1543
1544 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1545 memcpy(dst, src, 8);
1546 #endif
1547 }
1548
void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int h, int x, int y)
1551 {
1552 #if 1
1553 double ftmp[1];
1554 uint64_t tmp[1];
1555 mips_reg addr[2];
1556 DECLARE_VAR_LOW32;
1557
1558 __asm__ volatile (
1559 "1: \n\t"
1560 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1561 MMI_LWC1(%[ftmp0], %[src], 0x00)
1562 "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1563 "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1564 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1565 MMI_SWC1(%[ftmp0], %[dst], 0x00)
1566 "addiu %[h], %[h], -0x02 \n\t"
1567 "swl %[tmp0], 0x03(%[addr1]) \n\t"
1568 "swr %[tmp0], 0x00(%[addr1]) \n\t"
1569 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1570 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1571 "bnez %[h], 1b \n\t"
1572 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1573 RESTRICT_ASM_LOW32
1574 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1575 [dst]"+&r"(dst), [src]"+&r"(src),
1576 [h]"+&r"(h)
1577 : [dststride]"r"((mips_reg)dststride),
1578 [srcstride]"r"((mips_reg)srcstride)
1579 : "memory"
1580 );
1581 #else
1582 int i;
1583
1584 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1585 memcpy(dst, src, 4);
1586 #endif
1587 }
1588
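/* Sub-pixel (epel) interpolation filters. Each function keeps the scalar
 * reference formulas in its block comment and in its #else branch; the MMI
 * path evaluates them per 8-pixel (PUT_VP8_EPEL8_*) or 4-pixel
 * (PUT_VP8_EPEL4_*) macro invocation. The 16-wide variants process each row
 * as two 8-pixel halves, offsetting src/dst by 8 bytes for the second half. */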
void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
1591 {
1592 #if 1
1593 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1594 double ftmp[9];
1595 uint32_t tmp[1];
1596 mips_reg src1, dst1;
1597 DECLARE_VAR_ALL64;
1598
1599 /*
1600 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1601 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1602 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1603 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1604 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1605 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1606 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1607 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1608
1609 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1610 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1611 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1612 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1613 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1614 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1615 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1616 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1617 */
1618 __asm__ volatile (
1619 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1620 "li %[tmp0], 0x07 \n\t"
1621 "mtc1 %[tmp0], %[ftmp4] \n\t"
1622
1623 "1: \n\t"
1624 // 0 - 7
1625 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1626 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1627 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1628 // 8 - 15
1629 PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1630
1631 "addiu %[h], %[h], -0x01 \n\t"
1632 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1633 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1634 "bnez %[h], 1b \n\t"
1635 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1636 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1637 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1638 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1639 [ftmp8]"=&f"(ftmp[8]),
1640 [tmp0]"=&r"(tmp[0]),
1641 RESTRICT_ASM_ALL64
1642 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1643 [h]"+&r"(h),
1644 [dst]"+&r"(dst), [src]"+&r"(src)
1645 : [ff_pw_64]"f"(ff_pw_64),
1646 [srcstride]"r"((mips_reg)srcstride),
1647 [dststride]"r"((mips_reg)dststride),
1648 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1649 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1650 : "memory"
1651 );
1652 #else
1653 const uint8_t *filter = subpel_filters[mx - 1];
1654 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1655 int x, y;
1656
1657 for (y = 0; y < h; y++) {
1658 for (x = 0; x < 16; x++)
1659 dst[x] = FILTER_4TAP(src, filter, 1);
1660 dst += dststride;
1661 src += srcstride;
1662 }
1663 #endif
1664 }
1665
void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
1668 {
1669 #if 1
1670 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1671 double ftmp[9];
1672 uint32_t tmp[1];
1673 DECLARE_VAR_ALL64;
1674
1675 /*
1676 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1677 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1678 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1679 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1680 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1681 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1682 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1683 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1684 */
1685 __asm__ volatile (
1686 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1687 "li %[tmp0], 0x07 \n\t"
1688 "mtc1 %[tmp0], %[ftmp4] \n\t"
1689
1690 "1: \n\t"
1691 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1692
1693 "addiu %[h], %[h], -0x01 \n\t"
1694 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1695 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1696 "bnez %[h], 1b \n\t"
1697 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1698 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1699 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1700 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1701 [ftmp8]"=&f"(ftmp[8]),
1702 [tmp0]"=&r"(tmp[0]),
1703 RESTRICT_ASM_ALL64
1704 [h]"+&r"(h),
1705 [dst]"+&r"(dst), [src]"+&r"(src)
1706 : [ff_pw_64]"f"(ff_pw_64),
1707 [srcstride]"r"((mips_reg)srcstride),
1708 [dststride]"r"((mips_reg)dststride),
1709 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1710 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1711 : "memory"
1712 );
1713 #else
1714 const uint8_t *filter = subpel_filters[mx - 1];
1715 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1716 int x, y;
1717
1718 for (y = 0; y < h; y++) {
1719 for (x = 0; x < 8; x++)
1720 dst[x] = FILTER_4TAP(src, filter, 1);
1721 dst += dststride;
1722 src += srcstride;
1723 }
1724 #endif
1725 }
1726
void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
1729 {
1730 #if 1
1731 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1732 double ftmp[6];
1733 uint32_t tmp[1];
1734 DECLARE_VAR_LOW32;
1735
1736 /*
1737 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1738 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1739 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1740 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1741 */
1742 __asm__ volatile (
1743 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1744 "li %[tmp0], 0x07 \n\t"
1745 "mtc1 %[tmp0], %[ftmp4] \n\t"
1746
1747 "1: \n\t"
1748 PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1749
1750 "addiu %[h], %[h], -0x01 \n\t"
1751 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1752 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1753 "bnez %[h], 1b \n\t"
1754 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1755 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1756 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1757 [tmp0]"=&r"(tmp[0]),
1758 RESTRICT_ASM_LOW32
1759 [h]"+&r"(h),
1760 [dst]"+&r"(dst), [src]"+&r"(src)
1761 : [ff_pw_64]"f"(ff_pw_64),
1762 [srcstride]"r"((mips_reg)srcstride),
1763 [dststride]"r"((mips_reg)dststride),
1764 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1765 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1766 : "memory"
1767 );
1768 #else
1769 const uint8_t *filter = subpel_filters[mx - 1];
1770 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1771 int x, y;
1772
1773 for (y = 0; y < h; y++) {
1774 for (x = 0; x < 4; x++)
1775 dst[x] = FILTER_4TAP(src, filter, 1);
1776 dst += dststride;
1777 src += srcstride;
1778 }
1779 #endif
1780 }
1781
void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
1784 {
1785 #if 1
1786 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1787 double ftmp[9];
1788 uint32_t tmp[1];
1789 mips_reg src1, dst1;
1790 DECLARE_VAR_ALL64;
1791
1792 /*
1793 dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1794 dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1795 dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1796 dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1797 dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1798 dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1799 dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1800 dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1801
1802 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1803 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1804 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1805 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1806 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1807 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1808 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1809 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1810 */
1811 __asm__ volatile (
1812 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1813 "li %[tmp0], 0x07 \n\t"
1814 "mtc1 %[tmp0], %[ftmp4] \n\t"
1815
1816 "1: \n\t"
1817 // 0 - 7
1818 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1819 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1820 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1821 // 8 - 15
1822 PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1823
1824 "addiu %[h], %[h], -0x01 \n\t"
1825 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1826 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1827 "bnez %[h], 1b \n\t"
1828 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1829 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1830 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1831 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1832 [ftmp8]"=&f"(ftmp[8]),
1833 [tmp0]"=&r"(tmp[0]),
1834 RESTRICT_ASM_ALL64
1835 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1836 [h]"+&r"(h),
1837 [dst]"+&r"(dst), [src]"+&r"(src)
1838 : [ff_pw_64]"f"(ff_pw_64),
1839 [srcstride]"r"((mips_reg)srcstride),
1840 [dststride]"r"((mips_reg)dststride),
1841 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1842 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1843 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1844 : "memory"
1845 );
1846 #else
1847 const uint8_t *filter = subpel_filters[mx - 1];
1848 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1849 int x, y;
1850
1851 for (y = 0; y < h; y++) {
1852 for (x = 0; x < 16; x++)
1853 dst[x] = FILTER_6TAP(src, filter, 1);
1854 dst += dststride;
1855 src += srcstride;
1856 }
1857 #endif
1858 }
1859
void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
1862 {
1863 #if 1
1864 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1865 double ftmp[9];
1866 uint32_t tmp[1];
1867 DECLARE_VAR_ALL64;
1868
1869 /*
1870 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1871 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1872 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1873 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1874 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1875 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1876 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1877 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1878 */
1879 __asm__ volatile (
1880 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1881 "li %[tmp0], 0x07 \n\t"
1882 "mtc1 %[tmp0], %[ftmp4] \n\t"
1883
1884 "1: \n\t"
1885 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1886
1887 "addiu %[h], %[h], -0x01 \n\t"
1888 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1889 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1890 "bnez %[h], 1b \n\t"
1891 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1892 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1893 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1894 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1895 [ftmp8]"=&f"(ftmp[8]),
1896 [tmp0]"=&r"(tmp[0]),
1897 RESTRICT_ASM_ALL64
1898 [h]"+&r"(h),
1899 [dst]"+&r"(dst), [src]"+&r"(src)
1900 : [ff_pw_64]"f"(ff_pw_64),
1901 [srcstride]"r"((mips_reg)srcstride),
1902 [dststride]"r"((mips_reg)dststride),
1903 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1904 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1905 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1906 : "memory"
1907 );
1908 #else
1909 const uint8_t *filter = subpel_filters[mx - 1];
1910 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1911 int x, y;
1912
1913 for (y = 0; y < h; y++) {
1914 for (x = 0; x < 8; x++)
1915 dst[x] = FILTER_6TAP(src, filter, 1);
1916 dst += dststride;
1917 src += srcstride;
1918 }
1919 #endif
1920 }
1921
void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
1924 {
1925 #if 1
1926 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1927 double ftmp[6];
1928 uint32_t tmp[1];
1929 DECLARE_VAR_LOW32;
1930
1931 /*
1932 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1933 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1934 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1935 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1936 */
1937 __asm__ volatile (
1938 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1939 "li %[tmp0], 0x07 \n\t"
1940 "mtc1 %[tmp0], %[ftmp4] \n\t"
1941
1942 "1: \n\t"
1943 PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1944
1945 "addiu %[h], %[h], -0x01 \n\t"
1946 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1947 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1948 "bnez %[h], 1b \n\t"
1949 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1950 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1951 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1952 [tmp0]"=&r"(tmp[0]),
1953 RESTRICT_ASM_LOW32
1954 [h]"+&r"(h),
1955 [dst]"+&r"(dst), [src]"+&r"(src)
1956 : [ff_pw_64]"f"(ff_pw_64),
1957 [srcstride]"r"((mips_reg)srcstride),
1958 [dststride]"r"((mips_reg)dststride),
1959 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1960 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1961 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1962 : "memory"
1963 );
1964 #else
1965 const uint8_t *filter = subpel_filters[mx - 1];
1966 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1967 int x, y;
1968
1969 for (y = 0; y < h; y++) {
1970 for (x = 0; x < 4; x++)
1971 dst[x] = FILTER_6TAP(src, filter, 1);
1972 dst += dststride;
1973 src += srcstride;
1974 }
1975 #endif
1976 }
1977
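/* Vertical epel filters: same structure as the horizontal ones above, but the
 * taps run along srcstride, i.e. the 4-tap kernel reads rows
 * src - srcstride .. src + 2*srcstride and the 6-tap kernel reads rows
 * src - 2*srcstride .. src + 3*srcstride, as spelled out in the reference
 * formulas below. */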
void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
1980 {
1981 #if 1
1982 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1983 double ftmp[9];
1984 uint32_t tmp[1];
1985 mips_reg src0, src1, dst0;
1986 DECLARE_VAR_ALL64;
1987
1988 /*
1989 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1990 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1991 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1992 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1993 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1994 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1995 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1996 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1997
1998 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
1999 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2000 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2001 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2002 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2003 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2004 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2005 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2006 */
2007 __asm__ volatile (
2008 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2009 "li %[tmp0], 0x07 \n\t"
2010 "mtc1 %[tmp0], %[ftmp4] \n\t"
2011
2012 "1: \n\t"
2013 // 0 - 7
2014 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2015 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2016 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2017 // 8 - 15
        PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2019
2020 "addiu %[h], %[h], -0x01 \n\t"
2021 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2022 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2023 "bnez %[h], 1b \n\t"
2024 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2025 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2026 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2027 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2028 [ftmp8]"=&f"(ftmp[8]),
2029 [tmp0]"=&r"(tmp[0]),
2030 RESTRICT_ASM_ALL64
2031 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2032 [src1]"=&r"(src1),
2033 [h]"+&r"(h),
2034 [dst]"+&r"(dst), [src]"+&r"(src)
2035 : [ff_pw_64]"f"(ff_pw_64),
2036 [srcstride]"r"((mips_reg)srcstride),
2037 [dststride]"r"((mips_reg)dststride),
2038 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2039 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2040 : "memory"
2041 );
2042 #else
2043 const uint8_t *filter = subpel_filters[my - 1];
2044 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2045 int x, y;
2046
2047 for (y = 0; y < h; y++) {
2048 for (x = 0; x < 16; x++)
2049 dst[x] = FILTER_4TAP(src, filter, srcstride);
2050 dst += dststride;
2051 src += srcstride;
2052 }
2053 #endif
2054 }
2055
void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
2058 {
2059 #if 1
2060 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2061 double ftmp[9];
2062 uint32_t tmp[1];
2063 mips_reg src1;
2064 DECLARE_VAR_ALL64;
2065
2066 /*
2067 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2068 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2069 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2070 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2071 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2072 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2073 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2074 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2075 */
2076 __asm__ volatile (
2077 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2078 "li %[tmp0], 0x07 \n\t"
2079 "mtc1 %[tmp0], %[ftmp4] \n\t"
2080
2081 "1: \n\t"
2082 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2083
2084 "addiu %[h], %[h], -0x01 \n\t"
2085 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2086 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2087 "bnez %[h], 1b \n\t"
2088 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2089 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2090 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2091 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2092 [ftmp8]"=&f"(ftmp[8]),
2093 [tmp0]"=&r"(tmp[0]),
2094 RESTRICT_ASM_ALL64
2095 [src1]"=&r"(src1),
2096 [h]"+&r"(h),
2097 [dst]"+&r"(dst), [src]"+&r"(src)
2098 : [ff_pw_64]"f"(ff_pw_64),
2099 [srcstride]"r"((mips_reg)srcstride),
2100 [dststride]"r"((mips_reg)dststride),
2101 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2102 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2103 : "memory"
2104 );
2105 #else
2106 const uint8_t *filter = subpel_filters[my - 1];
2107 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2108 int x, y;
2109
2110 for (y = 0; y < h; y++) {
2111 for (x = 0; x < 8; x++)
2112 dst[x] = FILTER_4TAP(src, filter, srcstride);
2113 dst += dststride;
2114 src += srcstride;
2115 }
2116 #endif
2117 }
2118
void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
2121 {
2122 #if 1
2123 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2124 double ftmp[6];
2125 uint32_t tmp[1];
2126 mips_reg src1;
2127 DECLARE_VAR_LOW32;
2128
2129 /*
2130 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2131 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2132 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2133 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2134 */
2135 __asm__ volatile (
2136 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2137 "li %[tmp0], 0x07 \n\t"
2138 "mtc1 %[tmp0], %[ftmp4] \n\t"
2139
2140 "1: \n\t"
2141 PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2142
2143 "addiu %[h], %[h], -0x01 \n\t"
2144 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2145 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2146 "bnez %[h], 1b \n\t"
2147 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2148 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2149 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2150 [tmp0]"=&r"(tmp[0]),
2151 RESTRICT_ASM_LOW32
2152 [src1]"=&r"(src1),
2153 [h]"+&r"(h),
2154 [dst]"+&r"(dst), [src]"+&r"(src)
2155 : [ff_pw_64]"f"(ff_pw_64),
2156 [srcstride]"r"((mips_reg)srcstride),
2157 [dststride]"r"((mips_reg)dststride),
2158 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2159 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2160 : "memory"
2161 );
2162 #else
2163 const uint8_t *filter = subpel_filters[my - 1];
2164 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2165 int x, y;
2166
2167 for (y = 0; y < h; y++) {
2168 for (x = 0; x < 4; x++)
2169 dst[x] = FILTER_4TAP(src, filter, srcstride);
2170 dst += dststride;
2171 src += srcstride;
2172 }
2173 #endif
2174 }
2175
void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int h, int mx, int my)
2178 {
2179 #if 1
2180 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2181 double ftmp[9];
2182 uint32_t tmp[1];
2183 mips_reg src0, src1, dst0;
2184 DECLARE_VAR_ALL64;
2185
2186 /*
2187 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2188 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2189 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2190 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2191 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2192 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2193 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2194 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2195
2196 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2197 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2198 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2199 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2200 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2201 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2202 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2203 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2204 */
2205 __asm__ volatile (
2206 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2207 "li %[tmp0], 0x07 \n\t"
2208 "mtc1 %[tmp0], %[ftmp4] \n\t"
2209
2210 "1: \n\t"
2211 // 0 - 7
2212 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2213 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2214 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2215 // 8 - 15
2216 PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2217
2218 "addiu %[h], %[h], -0x01 \n\t"
2219 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2220 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2221 "bnez %[h], 1b \n\t"
2222 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2223 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2224 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2225 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2226 [ftmp8]"=&f"(ftmp[8]),
2227 [tmp0]"=&r"(tmp[0]),
2228 RESTRICT_ASM_ALL64
2229 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2230 [src1]"=&r"(src1),
2231 [h]"+&r"(h),
2232 [dst]"+&r"(dst), [src]"+&r"(src)
2233 : [ff_pw_64]"f"(ff_pw_64),
2234 [srcstride]"r"((mips_reg)srcstride),
2235 [dststride]"r"((mips_reg)dststride),
2236 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2237 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2238 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2239 : "memory"
2240 );
2241 #else
2242 const uint8_t *filter = subpel_filters[my - 1];
2243 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2244 int x, y;
2245
2246 for (y = 0; y < h; y++) {
2247 for (x = 0; x < 16; x++)
2248 dst[x] = FILTER_6TAP(src, filter, srcstride);
2249 dst += dststride;
2250 src += srcstride;
2251 }
2252 #endif
2253 }
2254
void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
2257 {
2258 #if 1
2259 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2260 double ftmp[9];
2261 uint32_t tmp[1];
2262 mips_reg src1;
2263 DECLARE_VAR_ALL64;
2264
2265 /*
2266 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2267 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2268 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2269 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2270 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2271 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2272 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2273 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2274 */
2275 __asm__ volatile (
2276 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2277 "li %[tmp0], 0x07 \n\t"
2278 "mtc1 %[tmp0], %[ftmp4] \n\t"
2279
2280 "1: \n\t"
2281 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2282
2283 "addiu %[h], %[h], -0x01 \n\t"
2284 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2285 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2286 "bnez %[h], 1b \n\t"
2287 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2288 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2289 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2290 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2291 [ftmp8]"=&f"(ftmp[8]),
2292 [tmp0]"=&r"(tmp[0]),
2293 RESTRICT_ASM_ALL64
2294 [src1]"=&r"(src1),
2295 [h]"+&r"(h),
2296 [dst]"+&r"(dst), [src]"+&r"(src)
2297 : [ff_pw_64]"f"(ff_pw_64),
2298 [srcstride]"r"((mips_reg)srcstride),
2299 [dststride]"r"((mips_reg)dststride),
2300 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2301 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2302 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2303 : "memory"
2304 );
2305 #else
2306 const uint8_t *filter = subpel_filters[my - 1];
2307 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2308 int x, y;
2309
2310 for (y = 0; y < h; y++) {
2311 for (x = 0; x < 8; x++)
2312 dst[x] = FILTER_6TAP(src, filter, srcstride);
2313 dst += dststride;
2314 src += srcstride;
2315 }
2316 #endif
2317 }
2318
void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int mx, int my)
2321 {
2322 #if 1
2323 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2324 double ftmp[6];
2325 uint32_t tmp[1];
2326 mips_reg src1;
2327 DECLARE_VAR_LOW32;
2328
2329 /*
2330 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2331 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2332 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2333 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2334 */
2335 __asm__ volatile (
2336 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2337 "li %[tmp0], 0x07 \n\t"
2338 "mtc1 %[tmp0], %[ftmp4] \n\t"
2339
2340 "1: \n\t"
2341 PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2342
2343 "addiu %[h], %[h], -0x01 \n\t"
2344 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2345 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2346 "bnez %[h], 1b \n\t"
2347 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2348 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2349 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2350 [tmp0]"=&r"(tmp[0]),
2351 RESTRICT_ASM_LOW32
2352 [src1]"=&r"(src1),
2353 [h]"+&r"(h),
2354 [dst]"+&r"(dst), [src]"+&r"(src)
2355 : [ff_pw_64]"f"(ff_pw_64),
2356 [srcstride]"r"((mips_reg)srcstride),
2357 [dststride]"r"((mips_reg)dststride),
2358 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2359 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2360 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2361 : "memory"
2362 );
2363 #else
2364 const uint8_t *filter = subpel_filters[my - 1];
2365 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2366 int x, y;
2367
2368 for (y = 0; y < h; y++) {
2369 for (x = 0; x < 4; x++)
2370 dst[x] = FILTER_6TAP(src, filter, srcstride);
2371 dst += dststride;
2372 src += srcstride;
2373 }
2374 #endif
2375 }
2376
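/* Combined horizontal + vertical (hXvY) filters work in two passes through a
 * small on-stack buffer: the horizontal pass produces h + 3 rows (4-tap
 * vertical) or h + 5 rows (6-tap vertical) starting one or two rows above the
 * destination block, and the vertical pass then reads from the row that
 * corresponds to the original src row 0 (tmp + width for 4-tap, tmp + 2*width
 * for 6-tap). For instance, the 16-pixel h6v6 case further down reduces to:
 *
 *     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
 *     src -= 2 * srcstride;
 *     ff_put_vp8_epel16_h6_mmi(tmp_array, 16, src, srcstride, h + 5, mx, my);
 *     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp_array + 32, 16, h, mx, my);
 */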
void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
2379 {
2380 #if 1
2381 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2382 uint8_t *tmp = tmp_array;
2383
2384 src -= srcstride;
2385 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2386 tmp = tmp_array + 16;
2387 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2388 #else
2389 const uint8_t *filter = subpel_filters[mx - 1];
2390 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2391 int x, y;
2392 uint8_t tmp_array[560];
2393 uint8_t *tmp = tmp_array;
2394
2395 src -= srcstride;
2396
2397 for (y = 0; y < h + 3; y++) {
2398 for (x = 0; x < 16; x++)
2399 tmp[x] = FILTER_4TAP(src, filter, 1);
2400 tmp += 16;
2401 src += srcstride;
2402 }
2403
2404 tmp = tmp_array + 16;
2405 filter = subpel_filters[my - 1];
2406
2407 for (y = 0; y < h; y++) {
2408 for (x = 0; x < 16; x++)
2409 dst[x] = FILTER_4TAP(tmp, filter, 16);
2410 dst += dststride;
2411 tmp += 16;
2412 }
2413 #endif
2414 }
2415
void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2418 {
2419 #if 1
2420 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2421 uint8_t *tmp = tmp_array;
2422
2423 src -= srcstride;
2424 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2425 tmp = tmp_array + 8;
2426 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2427 #else
2428 const uint8_t *filter = subpel_filters[mx - 1];
2429 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2430 int x, y;
2431 uint8_t tmp_array[152];
2432 uint8_t *tmp = tmp_array;
2433
2434 src -= srcstride;
2435
2436 for (y = 0; y < h + 3; y++) {
2437 for (x = 0; x < 8; x++)
2438 tmp[x] = FILTER_4TAP(src, filter, 1);
2439 tmp += 8;
2440 src += srcstride;
2441 }
2442
2443 tmp = tmp_array + 8;
2444 filter = subpel_filters[my - 1];
2445
2446 for (y = 0; y < h; y++) {
2447 for (x = 0; x < 8; x++)
2448 dst[x] = FILTER_4TAP(tmp, filter, 8);
2449 dst += dststride;
2450 tmp += 8;
2451 }
2452 #endif
2453 }
2454
void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2457 {
2458 #if 1
2459 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2460 uint8_t *tmp = tmp_array;
2461
2462 src -= srcstride;
2463 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2464 tmp = tmp_array + 4;
2465 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2466 #else
2467 const uint8_t *filter = subpel_filters[mx - 1];
2468 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2469 int x, y;
2470 uint8_t tmp_array[44];
2471 uint8_t *tmp = tmp_array;
2472
2473 src -= srcstride;
2474
2475 for (y = 0; y < h + 3; y++) {
2476 for (x = 0; x < 4; x++)
2477 tmp[x] = FILTER_4TAP(src, filter, 1);
2478 tmp += 4;
2479 src += srcstride;
2480 }
2481 tmp = tmp_array + 4;
2482 filter = subpel_filters[my - 1];
2483
2484 for (y = 0; y < h; y++) {
2485 for (x = 0; x < 4; x++)
2486 dst[x] = FILTER_4TAP(tmp, filter, 4);
2487 dst += dststride;
2488 tmp += 4;
2489 }
2490 #endif
2491 }
2492
void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
2495 {
2496 #if 1
2497 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2498 uint8_t *tmp = tmp_array;
2499
2500 src -= 2 * srcstride;
2501 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2502 tmp = tmp_array + 32;
2503 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2504 #else
2505 const uint8_t *filter = subpel_filters[mx - 1];
2506 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2507 int x, y;
2508 uint8_t tmp_array[592];
2509 uint8_t *tmp = tmp_array;
2510
2511 src -= 2 * srcstride;
2512
2513 for (y = 0; y < h + 5; y++) {
2514 for (x = 0; x < 16; x++)
2515 tmp[x] = FILTER_4TAP(src, filter, 1);
2516 tmp += 16;
2517 src += srcstride;
2518 }
2519
2520 tmp = tmp_array + 32;
2521 filter = subpel_filters[my - 1];
2522
2523 for (y = 0; y < h; y++) {
2524 for (x = 0; x < 16; x++)
2525 dst[x] = FILTER_6TAP(tmp, filter, 16);
2526 dst += dststride;
2527 tmp += 16;
2528 }
2529 #endif
2530 }
2531
void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2534 {
2535 #if 1
2536 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2537 uint8_t *tmp = tmp_array;
2538
2539 src -= 2 * srcstride;
2540 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2541 tmp = tmp_array + 16;
2542 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2543 #else
2544 const uint8_t *filter = subpel_filters[mx - 1];
2545 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2546 int x, y;
2547 uint8_t tmp_array[168];
2548 uint8_t *tmp = tmp_array;
2549
2550 src -= 2 * srcstride;
2551
2552 for (y = 0; y < h + 5; y++) {
2553 for (x = 0; x < 8; x++)
2554 tmp[x] = FILTER_4TAP(src, filter, 1);
2555 tmp += 8;
2556 src += srcstride;
2557 }
2558
2559 tmp = tmp_array + 16;
2560 filter = subpel_filters[my - 1];
2561
2562 for (y = 0; y < h; y++) {
2563 for (x = 0; x < 8; x++)
2564 dst[x] = FILTER_6TAP(tmp, filter, 8);
2565 dst += dststride;
2566 tmp += 8;
2567 }
2568 #endif
2569 }
2570
void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2573 {
2574 #if 1
2575 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2576 uint8_t *tmp = tmp_array;
2577
2578 src -= 2 * srcstride;
2579 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2580 tmp = tmp_array + 8;
2581 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2582 #else
2583 const uint8_t *filter = subpel_filters[mx - 1];
2584 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2585 int x, y;
2586 uint8_t tmp_array[52];
2587 uint8_t *tmp = tmp_array;
2588
2589 src -= 2 * srcstride;
2590
2591 for (y = 0; y < h + 5; y++) {
2592 for (x = 0; x < 4; x++)
2593 tmp[x] = FILTER_4TAP(src, filter, 1);
2594 tmp += 4;
2595 src += srcstride;
2596 }
2597
2598 tmp = tmp_array + 8;
2599 filter = subpel_filters[my - 1];
2600
2601 for (y = 0; y < h; y++) {
2602 for (x = 0; x < 4; x++)
2603 dst[x] = FILTER_6TAP(tmp, filter, 4);
2604 dst += dststride;
2605 tmp += 4;
2606 }
2607 #endif
2608 }
2609
void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
2612 {
2613 #if 1
2614 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2615 uint8_t *tmp = tmp_array;
2616
2617 src -= srcstride;
2618 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2619 tmp = tmp_array + 16;
2620 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2621 #else
2622 const uint8_t *filter = subpel_filters[mx - 1];
2623 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2624 int x, y;
2625 uint8_t tmp_array[560];
2626 uint8_t *tmp = tmp_array;
2627
2628 src -= srcstride;
2629
2630 for (y = 0; y < h + 3; y++) {
2631 for (x = 0; x < 16; x++)
2632 tmp[x] = FILTER_6TAP(src, filter, 1);
2633 tmp += 16;
2634 src += srcstride;
2635 }
2636
2637 tmp = tmp_array + 16;
2638 filter = subpel_filters[my - 1];
2639
2640 for (y = 0; y < h; y++) {
2641 for (x = 0; x < 16; x++)
2642 dst[x] = FILTER_4TAP(tmp, filter, 16);
2643 dst += dststride;
2644 tmp += 16;
2645 }
2646 #endif
2647 }
2648
void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2651 {
2652 #if 1
2653 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2654 uint8_t *tmp = tmp_array;
2655
2656 src -= srcstride;
2657 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2658 tmp = tmp_array + 8;
2659 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2660 #else
2661 const uint8_t *filter = subpel_filters[mx - 1];
2662 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2663 int x, y;
2664 uint8_t tmp_array[152];
2665 uint8_t *tmp = tmp_array;
2666
2667 src -= srcstride;
2668
2669 for (y = 0; y < h + 3; y++) {
2670 for (x = 0; x < 8; x++)
2671 tmp[x] = FILTER_6TAP(src, filter, 1);
2672 tmp += 8;
2673 src += srcstride;
2674 }
2675
2676 tmp = tmp_array + 8;
2677 filter = subpel_filters[my - 1];
2678
2679 for (y = 0; y < h; y++) {
2680 for (x = 0; x < 8; x++)
2681 dst[x] = FILTER_4TAP(tmp, filter, 8);
2682 dst += dststride;
2683 tmp += 8;
2684 }
2685 #endif
2686 }
2687
void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2690 {
2691 #if 1
2692 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2693 uint8_t *tmp = tmp_array;
2694
2695 src -= srcstride;
2696 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2697 tmp = tmp_array + 4;
2698 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2699 #else
2700 const uint8_t *filter = subpel_filters[mx - 1];
2701 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2702 int x, y;
2703 uint8_t tmp_array[44];
2704 uint8_t *tmp = tmp_array;
2705
2706 src -= srcstride;
2707
2708 for (y = 0; y < h + 3; y++) {
2709 for (x = 0; x < 4; x++)
2710 tmp[x] = FILTER_6TAP(src, filter, 1);
2711 tmp += 4;
2712 src += srcstride;
2713 }
2714
2715 tmp = tmp_array + 4;
2716 filter = subpel_filters[my - 1];
2717
2718 for (y = 0; y < h; y++) {
2719 for (x = 0; x < 4; x++)
2720 dst[x] = FILTER_4TAP(tmp, filter, 4);
2721 dst += dststride;
2722 tmp += 4;
2723 }
2724 #endif
2725 }
2726
void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                ptrdiff_t srcstride, int h, int mx, int my)
2729 {
2730 #if 1
2731 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2732 uint8_t *tmp = tmp_array;
2733
2734 src -= 2 * srcstride;
2735 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2736 tmp = tmp_array + 32;
2737 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2738 #else
2739 const uint8_t *filter = subpel_filters[mx - 1];
2740 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2741 int x, y;
2742 uint8_t tmp_array[592];
2743 uint8_t *tmp = tmp_array;
2744
2745 src -= 2 * srcstride;
2746
2747 for (y = 0; y < h + 5; y++) {
2748 for (x = 0; x < 16; x++)
2749 tmp[x] = FILTER_6TAP(src, filter, 1);
2750 tmp += 16;
2751 src += srcstride;
2752 }
2753
2754 tmp = tmp_array + 32;
2755 filter = subpel_filters[my - 1];
2756
2757 for (y = 0; y < h; y++) {
2758 for (x = 0; x < 16; x++)
2759 dst[x] = FILTER_6TAP(tmp, filter, 16);
2760 dst += dststride;
2761 tmp += 16;
2762 }
2763 #endif
2764 }
2765
void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2768 {
2769 #if 1
2770 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2771 uint8_t *tmp = tmp_array;
2772
2773 src -= 2 * srcstride;
2774 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2775 tmp = tmp_array + 16;
2776 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2777 #else
2778 const uint8_t *filter = subpel_filters[mx - 1];
2779 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2780 int x, y;
2781 uint8_t tmp_array[168];
2782 uint8_t *tmp = tmp_array;
2783
2784 src -= 2 * srcstride;
2785
2786 for (y = 0; y < h + 5; y++) {
2787 for (x = 0; x < 8; x++)
2788 tmp[x] = FILTER_6TAP(src, filter, 1);
2789 tmp += 8;
2790 src += srcstride;
2791 }
2792
2793 tmp = tmp_array + 16;
2794 filter = subpel_filters[my - 1];
2795
2796 for (y = 0; y < h; y++) {
2797 for (x = 0; x < 8; x++)
2798 dst[x] = FILTER_6TAP(tmp, filter, 8);
2799 dst += dststride;
2800 tmp += 8;
2801 }
2802 #endif
2803 }
2804
void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int h, int mx, int my)
2807 {
2808 #if 1
2809 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2810 uint8_t *tmp = tmp_array;
2811
2812 src -= 2 * srcstride;
2813 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2814 tmp = tmp_array + 8;
2815 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2816 #else
2817 const uint8_t *filter = subpel_filters[mx - 1];
2818 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2819 int x, y;
2820 uint8_t tmp_array[52];
2821 uint8_t *tmp = tmp_array;
2822
2823 src -= 2 * srcstride;
2824
2825 for (y = 0; y < h + 5; y++) {
2826 for (x = 0; x < 4; x++)
2827 tmp[x] = FILTER_6TAP(src, filter, 1);
2828 tmp += 4;
2829 src += srcstride;
2830 }
2831
2832 tmp = tmp_array + 8;
2833 filter = subpel_filters[my - 1];
2834
2835 for (y = 0; y < h; y++) {
2836 for (x = 0; x < 4; x++)
2837 dst[x] = FILTER_6TAP(tmp, filter, 4);
2838 dst += dststride;
2839 tmp += 4;
2840 }
2841 #endif
2842 }
2843
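/* Bilinear filters. The horizontal weights are a = 8 - mx, b = mx (c = 8 - my,
 * d = my for the vertical direction) and each output pixel is
 * (a * src[x] + b * src[x + 1] + 4) >> 3; for mx = 2 this becomes
 * (6 * src[x] + 2 * src[x + 1] + 4) >> 3. The MMI path broadcasts each weight
 * into all four halfwords with pshufh (using the zeroed ftmp0 as shuffle
 * selector) and handles eight pixels per PUT_VP8_BILINEAR8_* invocation. */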
void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
2846 {
2847 #if 1
2848 int a = 8 - mx, b = mx;
2849 double ftmp[7];
2850 uint32_t tmp[1];
2851 mips_reg dst0, src0;
2852 DECLARE_VAR_ALL64;
2853
2854 /*
2855 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2856 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2857 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2858 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2859 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2860 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2861 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2862 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2863
2864 dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2865 dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2866 dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2867 dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2868 dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2869 dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2870 dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2871 dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2872 */
2873 __asm__ volatile (
2874 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2875 "li %[tmp0], 0x03 \n\t"
2876 "mtc1 %[tmp0], %[ftmp4] \n\t"
2877 "pshufh %[a], %[a], %[ftmp0] \n\t"
2878 "pshufh %[b], %[b], %[ftmp0] \n\t"
2879
2880 "1: \n\t"
2881 // 0 - 7
2882 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2883 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2884 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2885 // 8 - 15
2886 PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2887
2888 "addiu %[h], %[h], -0x01 \n\t"
2889 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2890 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2891 "bnez %[h], 1b \n\t"
2892 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2893 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2894 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2895 [ftmp6]"=&f"(ftmp[6]),
2896 [tmp0]"=&r"(tmp[0]),
2897 RESTRICT_ASM_ALL64
2898 [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2899 [h]"+&r"(h),
2900 [dst]"+&r"(dst), [src]"+&r"(src),
2901 [a]"+&f"(a), [b]"+&f"(b)
2902 : [sstride]"r"((mips_reg)sstride),
2903 [dstride]"r"((mips_reg)dstride),
2904 [ff_pw_4]"f"(ff_pw_4)
2905 : "memory"
2906 );
2907 #else
2908 int a = 8 - mx, b = mx;
2909 int x, y;
2910
2911 for (y = 0; y < h; y++) {
2912 for (x = 0; x < 16; x++)
2913 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2914 dst += dstride;
2915 src += sstride;
2916 }
2917 #endif
2918 }
2919
void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
        PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
        PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0), [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                  ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[528];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 16;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
        dst += dstride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int a = 8 - mx, b = mx;
    double ftmp[7];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a), [b]"+&f"(b)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int a = 8 - mx, b = mx;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[a], %[a], %[ftmp0] \n\t"
        "pshufh %[b], %[b], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [a]"+&f"(a), [b]"+&f"(b)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[c], %[c], %[ftmp0] \n\t"
        "pshufh %[d], %[d], %[ftmp0] \n\t"

        "1: \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[src], %[src], %[sstride] \n\t"
        PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [c]"+&f"(c), [d]"+&f"(d)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
                                 ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}
