/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1            double db_1
#define DECLARE_DOUBLE_2            double db_2
#define DECLARE_UINT32_T            uint32_t  it_1
#define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)

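/* MMI has no unsigned byte "greater than" compare, so MMI_PCMPGTUB
 * emulates it: db_1 = (src1 == src2), db_2 = (max(src1, src2) == src1),
 * i.e. src1 >= src2; xoring the two leaves dst = 0xff exactly in the
 * lanes where src1 > src2 (unsigned). */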
#define MMI_PCMPGTUB(dst, src1, src2)                                       \
        "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   \
        "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
        "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   \
        "pxor       "#dst",     %[db_2],        %[db_1]             \n\t"

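/* "Byte to halfword": sign-extend the eight bytes of src into two vectors
 * of four signed halfwords, dst_r from the low bytes and dst_l from the
 * high bytes, using pcmpgtb against zero as the sign mask. */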
#define MMI_BTOH(dst_l, dst_r, src)                                         \
        "pxor       %[db_1],    %[db_1],        %[db_1]             \n\t"   \
        "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   \
        "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
        "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"

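/* VP8 normal loop filter on eight pixels in parallel: derive the
 * high-edge-variance flag and the filter mask from p3..q3 with the e, i
 * and thresh limits, then apply the macroblock filter, updating p2..q2
 * in place (p3 and q3 are only read). */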
#define MMI_VP8_LOOP_FILTER                                                 \
        /* Calculation of hev */                                            \
        "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
        "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
        "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
        /* Calculation of mask */                                           \
        "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
        "li         %[tmp0],    0x09                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp3]                            \n\t"   \
        PSRLB_MMI(%[ftmp2],  %[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp2])     \
        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
        "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
        "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
        "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
        "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
        "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
        "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
        "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "pxor       %[mask],    %[mask],        %[ftmp3]            \n\t"   \
        /* VP8_MBFILTER */                                                  \
        "li         %[tmp0],    0x80808080                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
        "pxor       %[p2],      %[p2],          %[ftmp7]            \n\t"   \
        "pxor       %[p1],      %[p1],          %[ftmp7]            \n\t"   \
        "pxor       %[p0],      %[p0],          %[ftmp7]            \n\t"   \
        "pxor       %[q0],      %[q0],          %[ftmp7]            \n\t"   \
        "pxor       %[q1],      %[q1],          %[ftmp7]            \n\t"   \
        "pxor       %[q2],      %[q2],          %[ftmp7]            \n\t"   \
        "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
        "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
        MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
        MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
        /* Right part */                                                    \
        "paddh      %[ftmp5],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
        /* Left part */                                                     \
        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
        "pand       %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
        "pand       %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
        "li         %[tmp0],    0x04040404                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
        "li         %[tmp0],    0x0B                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp4]                            \n\t"   \
        PSRAB_MMI(%[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp6],  %[ftmp3])     \
        "li         %[tmp0],    0x03030303                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
        "li         %[tmp0],    0x0B                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
        PSRAB_MMI(%[ftmp4],  %[ftmp2],  %[ftmp5],  %[ftmp6],  %[ftmp4])     \
        "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
        /* filt_val &= ~hev */                                              \
        "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "pxor       %[hev],     %[hev],         %[ftmp0]            \n\t"   \
        "pand       %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
        MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
        "li         %[tmp0],    0x07                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
        "li         %[tmp0],    0x001b001b                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "li         %[tmp0],    0x003f003f                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        /* Right part */                                                    \
        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
        "pxor       %[q0],      %[q0],          %[ftmp7]            \n\t"   \
        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
        "pxor       %[p0],      %[p0],          %[ftmp7]            \n\t"   \
        "li         %[tmp0],    0x00120012                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        /* Right part */                                                    \
        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
        "pxor       %[q1],      %[q1],          %[ftmp7]            \n\t"   \
        "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
        "pxor       %[p1],      %[p1],          %[ftmp7]            \n\t"   \
        "li         %[tmp0],    0x03                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        /* Right part */                                                    \
        "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
        "pxor       %[q2],      %[q2],          %[ftmp7]            \n\t"   \
        "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
        "pxor       %[p2],      %[p2],          %[ftmp7]            \n\t"

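/* 4-pixel-wide, 6-tap horizontal subpel filter; the scalar reference is
 * FILTER_6TAP() in the #if 0 block below. ff_pw_64 is the rounding bias
 * and %[ftmp4] is expected to hold the final shift count (7), matching
 * the (+ 64) >> 7 of the reference. The 4-tap, vertical and 8-pixel-wide
 * variants below follow the same pattern. */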
#define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x02)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x03)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_H4_MMI(src, dst)                                      \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)                     \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)                     \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H6_MMI(src, dst)                                      \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x02)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x03)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H4_MMI(src, dst)                                      \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)                     \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)                     \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


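/* Bilinear filters: each output pixel is presumably
 * (a * src[x] + b * src[x + 1] + 4) >> 3, and the vertical variants use
 * the weights c/d across sstride; the weight operands are expected to be
 * replicated per halfword by the callers, with %[ftmp4] holding the
 * shift count 3. */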
#define PUT_VP8_BILINEAR8_H_MMI(src, dst)                                   \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[a]                \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[a]                \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[b]                \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst)                                   \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[a]                \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)                    \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[c]                \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[c]                \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[d]                \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)                    \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[c]                \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
   {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
    0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

   {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
    0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

   {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
    0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

   {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
    0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

   {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
    0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

   {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
    0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

   {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
    0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};
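
/* Each 64-bit constant above replicates one 16-bit filter coefficient four
 * times so it can feed pmullh directly; the seven rows are the VP8 subpel
 * filter sets, matching subpel_filters[] in the reference block below
 * (e.g. 0x007b007b007b007b is coefficient 123 of the first filter). */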

#if 0
#define FILTER_6TAP(src, F, stride)                                           \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride)                                           \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a)  (((a) * 35468) >> 16)
#endif

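/* cm[] (ff_crop_tab biased by MAX_NEG_CROP) saturates its index to
 * [0, 255], so biasing by 0x80 on the way in and out clamps n to the
 * signed byte range [-128, 127]. */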
#define clip_int8(n) (cm[(n) + 0x80] - 0x80)

static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a              = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 =  (9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[q0], %[dst], 0x0)
        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        MMI_ULDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_ULDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_ULDC1(%[p2], %[tmp0], 0x0)
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_ULDC1(%[p3], %[tmp0], 0x0)
        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        MMI_ULDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_ULDC1(%[q2], %[tmp0], 0x0)
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_ULDC1(%[q3], %[tmp0], 0x0)
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        MMI_USDC1(%[q0], %[dst], 0x0)
        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        MMI_USDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_USDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_USDC1(%[p2], %[tmp0], 0x0)
        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        MMI_USDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        MMI_USDC1(%[q2], %[tmp0], 0x0)
        : RESTRICT_ASM_ALL64
          [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

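/* The inner-edge vertical filter stays in scalar C, filtering one pixel
 * column at a time with the limit/filter helpers above. */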
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

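/* Vertical-edge (horizontal filter) variant: load an 8x8 block, transpose
 * it so the edge columns become rows, run the same MMI_VP8_LOOP_FILTER,
 * then transpose back before storing. */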
vp8_h_loop_filter8_mmi(uint8_t * dst,ptrdiff_t stride,int flim_E,int flim_I,int hev_thresh)860 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
861         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
862 {
863     double ftmp[18];
864     uint32_t tmp[1];
865     DECLARE_DOUBLE_1;
866     DECLARE_DOUBLE_2;
867     DECLARE_UINT32_T;
868     DECLARE_VAR_ALL64;
869 
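    /* Horizontal (cross-column) filter: load 8 bytes straddling the edge
     * (offset -4) from 8 consecutive rows, transpose the 8x8 tile so the
     * samples normal to the edge line up within each register, apply the
     * same filter macro as the vertical case, then transpose back. */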
870     __asm__ volatile(
871         /* Get data from dst */
872         MMI_ULDC1(%[p3], %[dst], -0x04)
873         PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
874         MMI_ULDC1(%[p2], %[tmp0], -0x04)
875         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
876         MMI_ULDC1(%[p1], %[tmp0], -0x04)
877         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
878         MMI_ULDC1(%[p0], %[tmp0], -0x04)
879         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
880         MMI_ULDC1(%[q0], %[tmp0], -0x04)
881         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
882         MMI_ULDC1(%[q1], %[tmp0], -0x04)
883         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
884         MMI_ULDC1(%[q2], %[tmp0], -0x04)
885         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
886         MMI_ULDC1(%[q3], %[tmp0], -0x04)
887         /* Matrix transpose */
888         TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
889                      %[q0], %[q1], %[q2], %[q3],
890                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
891         MMI_VP8_LOOP_FILTER
892         /* Matrix transpose */
893         TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
894                      %[q0], %[q1], %[q2], %[q3],
895                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
896         /* Move to dst */
897         MMI_USDC1(%[p3], %[dst], -0x04)
898         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
899         MMI_USDC1(%[p2], %[dst], -0x04)
900         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
901         MMI_USDC1(%[p1], %[dst], -0x04)
902         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
903         MMI_USDC1(%[p0], %[dst], -0x04)
904         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
905         MMI_USDC1(%[q0], %[dst], -0x04)
906         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
907         MMI_USDC1(%[q1], %[dst], -0x04)
908         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
909         MMI_USDC1(%[q2], %[dst], -0x04)
910         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
911         MMI_USDC1(%[q3], %[dst], -0x04)
912         : RESTRICT_ASM_ALL64
913           [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
914           [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
915           [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
916           [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
917           [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
918           [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
919           [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
920           [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
921           [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
922           [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
923           RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
924           RESTRICT_ASM_UINT32_T
925         : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
926           [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
927         : "memory"
928     );
929 }
930 
931 static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
932         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
933 {
934     int i;
935 
936     for (i = 0; i < 8; i++)
937         if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
938             int hv = hev(dst + i * stride, 1, hev_thresh);
939             if (hv)
940                 vp8_filter_common_is4tap(dst + i * stride, 1);
941             else
942                 vp8_filter_common_isnot4tap(dst + i * stride, 1);
943         }
944 }
945 
946 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
947 {
948 #if 1
949     double ftmp[8];
950     DECLARE_VAR_ALL64;
951 
952     __asm__ volatile (
953         MMI_LDC1(%[ftmp0], %[dc], 0x00)
954         MMI_LDC1(%[ftmp1], %[dc], 0x08)
955         MMI_LDC1(%[ftmp2], %[dc], 0x10)
956         MMI_LDC1(%[ftmp3], %[dc], 0x18)
957         "paddsh     %[ftmp4],   %[ftmp0],       %[ftmp3]            \n\t"
958         "psubsh     %[ftmp5],   %[ftmp0],       %[ftmp3]            \n\t"
959         "paddsh     %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
960         "psubsh     %[ftmp7],   %[ftmp1],       %[ftmp2]            \n\t"
961         "paddsh     %[ftmp0],   %[ftmp4],       %[ftmp6]            \n\t"
962         "paddsh     %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
963         "psubsh     %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
964         "psubsh     %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
965         MMI_SDC1(%[ftmp0], %[dc], 0x00)
966         MMI_SDC1(%[ftmp1], %[dc], 0x08)
967         MMI_SDC1(%[ftmp2], %[dc], 0x10)
968         MMI_SDC1(%[ftmp3], %[dc], 0x18)
969         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
970           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
971           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
972           [ftmp6]"=&f"(ftmp[6]),
973           RESTRICT_ASM_ALL64
974           [ftmp7]"=&f"(ftmp[7])
975         : [dc]"r"((uint8_t*)dc)
976         : "memory"
977     );
978 
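    /* The asm above performed the vertical (column) butterflies of the
     * inverse WHT over the four rows of dc[]; the scalar statements below
     * do the horizontal pass plus the final (x + 3) >> 3 rounding and
     * scatter each result into the DC coefficient of its 4x4 sub-block. */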
979     block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
980     block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
981     block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
982     block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
983 
984     block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
985     block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
986     block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
987     block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
988 
989     block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
990     block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
991     block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
992     block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
993 
994     block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
995     block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
996     block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
997     block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
998 
999     __asm__ volatile (
1000         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1001         MMI_SDC1(%[ftmp0], %[dc], 0x00)
1002         MMI_SDC1(%[ftmp0], %[dc], 0x08)
1003         MMI_SDC1(%[ftmp0], %[dc], 0x10)
1004         MMI_SDC1(%[ftmp0], %[dc], 0x18)
1005         : RESTRICT_ASM_ALL64
1006           [ftmp0]"=&f"(ftmp[0])
1007         : [dc]"r"((uint8_t *)dc)
1008         : "memory"
1009     );
1010 #else
1011     int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1012 
1013     t00 = dc[0] + dc[12];
1014     t10 = dc[1] + dc[13];
1015     t20 = dc[2] + dc[14];
1016     t30 = dc[3] + dc[15];
1017 
1018     t03 = dc[0] - dc[12];
1019     t13 = dc[1] - dc[13];
1020     t23 = dc[2] - dc[14];
1021     t33 = dc[3] - dc[15];
1022 
1023     t01 = dc[4] + dc[ 8];
1024     t11 = dc[5] + dc[ 9];
1025     t21 = dc[6] + dc[10];
1026     t31 = dc[7] + dc[11];
1027 
1028     t02 = dc[4] - dc[ 8];
1029     t12 = dc[5] - dc[ 9];
1030     t22 = dc[6] - dc[10];
1031     t32 = dc[7] - dc[11];
1032 
1033     dc[ 0] = t00 + t01;
1034     dc[ 1] = t10 + t11;
1035     dc[ 2] = t20 + t21;
1036     dc[ 3] = t30 + t31;
1037 
1038     dc[ 4] = t03 + t02;
1039     dc[ 5] = t13 + t12;
1040     dc[ 6] = t23 + t22;
1041     dc[ 7] = t33 + t32;
1042 
1043     dc[ 8] = t00 - t01;
1044     dc[ 9] = t10 - t11;
1045     dc[10] = t20 - t21;
1046     dc[11] = t30 - t31;
1047 
1048     dc[12] = t03 - t02;
1049     dc[13] = t13 - t12;
1050     dc[14] = t23 - t22;
1051     dc[15] = t33 - t32;
1052 
1053     block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1054     block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1055     block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1056     block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1057 
1058     block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1059     block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1060     block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1061     block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1062 
1063     block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1064     block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1065     block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1066     block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1067 
1068     block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1069     block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1070     block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1071     block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1072 
1073     AV_ZERO64(dc + 0);
1074     AV_ZERO64(dc + 4);
1075     AV_ZERO64(dc + 8);
1076     AV_ZERO64(dc + 12);
1077 #endif
1078 }
1079 
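/* DC-only inverse WHT: with only dc[0] non-zero every output DC is equal,
 * so (dc[0] + 3) >> 3 is simply replicated into all 16 sub-blocks. */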
1080 void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1081 {
1082     int val = (dc[0] + 3) >> 3;
1083 
1084     dc[0] = 0;
1085 
1086     block[0][0][0] = val;
1087     block[0][1][0] = val;
1088     block[0][2][0] = val;
1089     block[0][3][0] = val;
1090     block[1][0][0] = val;
1091     block[1][1][0] = val;
1092     block[1][2][0] = val;
1093     block[1][3][0] = val;
1094     block[2][0][0] = val;
1095     block[2][1][0] = val;
1096     block[2][2][0] = val;
1097     block[2][3][0] = val;
1098     block[3][0][0] = val;
1099     block[3][1][0] = val;
1100     block[3][2][0] = val;
1101     block[3][3][0] = val;
1102 }
1103 
1104 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1105 {
1106 #if 1
1107     double ftmp[12];
1108     uint32_t tmp[1];
1109     union av_intfloat64 ff_ph_4e7b_u;
1110     union av_intfloat64 ff_ph_22a3_u;
1111     DECLARE_VAR_LOW32;
1112     DECLARE_VAR_ALL64;
1113     ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
1114     ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;
1115 
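    /* Fixed-point IDCT constants: 0x4e7b = 20091, so pmulhh(x, 0x4e7b) + x
     * computes MUL_20091(x); 0x22a3 = 8867 = 35468 >> 2, so shifting x left
     * by 2 before pmulhh(., 0x22a3) computes MUL_35468(x) while keeping the
     * multiplier within the signed 16-bit range of pmulhh. In scalar form
     * (a sketch of the identities, not code that is executed here):
     *     int mul_20091(int a) { return ((a * 20091) >> 16) + a; }
     *     int mul_35468(int a) { return ((a << 2) * 8867) >> 16; }
     */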
1116     __asm__ volatile (
1117         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1118         MMI_LDC1(%[ftmp1], %[block], 0x00)
1119         MMI_LDC1(%[ftmp2], %[block], 0x08)
1120         MMI_LDC1(%[ftmp3], %[block], 0x10)
1121         MMI_LDC1(%[ftmp4], %[block], 0x18)
1122 
1123         "li         %[tmp0],    0x02                                \n\t"
1124         "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1125 
1126         // block[0...3] + block[8...11]
1127         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1128         // block[0...3] - block[8...11]
1129         "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1130         // MUL_35468(block[12...15])
1131         "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1132         "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1133         // MUL_35468(block[4...7])
1134         "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1135         "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1136         // MUL_20091(block[4...7])
1137         "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1138         "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
1139         // MUL_20091(block[12...15])
1140         "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1141         "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
1142 
1143         // tmp[0 4  8 12]
1144         "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
1145         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
1146         // tmp[1 5  9 13]
1147         "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
1148         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
1149         // tmp[2 6 10 14]
1150         "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
1151         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
1152         // tmp[3 7 11 15]
1153         "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
1154         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
1155 
1156         MMI_SDC1(%[ftmp0], %[block], 0x00)
1157         MMI_SDC1(%[ftmp0], %[block], 0x08)
1158         MMI_SDC1(%[ftmp0], %[block], 0x10)
1159         MMI_SDC1(%[ftmp0], %[block], 0x18)
1160 
1161         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1162                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1163 
1164         // t[0 4  8 12]
1165         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1166         // t[1 5  9 13]
1167         "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1168         // t[2 6 10 14]
1169         "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1170         "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1171         "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
1172         "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1173         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
1174         // t[3 7 11 15]
1175         "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1176         "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1177         "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
1178         "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1179         "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
1180 
1181         "li         %[tmp0],    0x03                                \n\t"
1182         "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1183         "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
1184         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_4]          \n\t"
1185         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
1186         "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
1187         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_4]          \n\t"
1188         "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
1189         "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
1190         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"
1191         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
1192         "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
1193         "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_4]          \n\t"
1194         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
1195 
1196         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1197                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1198 
1199         MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1200         MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1201         MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1202         MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1203 
1204         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1205         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1206         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
1207         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
1208 
1209         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1210         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1211         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1212         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
1213 
1214         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1215         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1216         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1217         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1218 
1219         MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1220         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1221         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1222         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1223         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1224           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1225           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1226           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1227           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1228           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
1229           RESTRICT_ASM_LOW32
1230           RESTRICT_ASM_ALL64
1231           [tmp0]"=&r"(tmp[0])
1232         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1233           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1234           [block]"r"(block),                [ff_pw_4]"f"(ff_pw_4.f),
1235           [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f),  [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
1236         : "memory"
1237     );
1238 #else
1239     int i, t0, t1, t2, t3;
1240     int16_t tmp[16];
1241 
1242     for (i = 0; i < 4; i++) {
1243         t0 = block[0 + i] + block[8 + i];
1244         t1 = block[0 + i] - block[8 + i];
1245         t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1246         t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1247         block[ 0 + i] = 0;
1248         block[ 4 + i] = 0;
1249         block[ 8 + i] = 0;
1250         block[12 + i] = 0;
1251 
1252         tmp[i * 4 + 0] = t0 + t3;
1253         tmp[i * 4 + 1] = t1 + t2;
1254         tmp[i * 4 + 2] = t1 - t2;
1255         tmp[i * 4 + 3] = t0 - t3;
1256     }
1257 
1258     for (i = 0; i < 4; i++) {
1259         t0 = tmp[0 + i] + tmp[8 + i];
1260         t1 = tmp[0 + i] - tmp[8 + i];
1261         t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1262         t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1263 
1264         dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1265         dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1266         dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1267         dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1268         dst   += stride;
1269     }
1270 #endif
1271 }
1272 
1273 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1274 {
1275 #if 1
1276     int dc = (block[0] + 4) >> 3;
1277     double ftmp[6];
1278     DECLARE_VAR_LOW32;
1279 
1280     block[0] = 0;
1281 
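    /* pshufh with a zero control word broadcasts the rounded DC into all
     * four 16-bit lanes; paddsh adds it to each unpacked row and packushb
     * packs back to bytes, providing the clip to [0, 255]. */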
1282     __asm__ volatile (
1283         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1284         "mtc1       %[dc],      %[ftmp5]                            \n\t"
1285         MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1286         MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1287         MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1288         MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1289         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1290         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1291         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1292         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1293         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1294         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1295         "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
1296         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1297         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
1298         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1299         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1300         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1301         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1302         MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1303         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1304         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1305         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1306         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1307           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1308           [ftmp4]"=&f"(ftmp[4]),
1309           RESTRICT_ASM_LOW32
1310           [ftmp5]"=&f"(ftmp[5])
1311         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1312           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1313           [dc]"r"(dc)
1314         : "memory"
1315     );
1316 #else
1317     int i, dc = (block[0] + 4) >> 3;
1318 
1319     block[0] = 0;
1320 
1321     for (i = 0; i < 4; i++) {
1322         dst[0] = av_clip_uint8(dst[0] + dc);
1323         dst[1] = av_clip_uint8(dst[1] + dc);
1324         dst[2] = av_clip_uint8(dst[2] + dc);
1325         dst[3] = av_clip_uint8(dst[3] + dc);
1326         dst   += stride;
1327     }
1328 #endif
1329 }
1330 
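/* The four DC-only luma blocks sit side by side, 4 pixels apart; the four
 * chroma blocks (next function) form a 2x2 grid, offset by 4 pixels
 * horizontally and 4 rows vertically. */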
1331 void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1332         ptrdiff_t stride)
1333 {
1334     ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
1335     ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
1336     ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
1337     ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1338 }
1339 
1340 void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1341         ptrdiff_t stride)
1342 {
1343     ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1344     ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1345     ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1346     ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1347 }
1348 
1349 // loop filter applied to edges between macroblocks
1350 void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1351         int flim_I, int hev_thresh)
1352 {
1353     vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1354     vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1355 }
1356 
1357 void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1358         int flim_I, int hev_thresh)
1359 {
1360     vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1361     vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1362                            hev_thresh);
1363 }
1364 
1365 void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1366         int flim_E, int flim_I, int hev_thresh)
1367 {
1368     vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1369     vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1370 }
1371 
1372 void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1373         int flim_E, int flim_I, int hev_thresh)
1374 {
1375     vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1376     vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1377 }
1378 
1379 // loop filter applied to inner macroblock edges
1380 void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1381         int flim_E, int flim_I, int hev_thresh)
1382 {
1383     int i;
1384 
1385     for (i = 0; i < 16; i++)
1386         if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1387             int hv = hev(dst + i * 1, stride, hev_thresh);
1388             if (hv)
1389                 vp8_filter_common_is4tap(dst + i * 1, stride);
1390             else
1391                 vp8_filter_common_isnot4tap(dst + i * 1, stride);
1392         }
1393 }
1394 
1395 void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1396         int flim_E, int flim_I, int hev_thresh)
1397 {
1398     int i;
1399 
1400     for (i = 0; i < 16; i++)
1401         if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1402             int hv = hev(dst + i * stride, 1, hev_thresh);
1403             if (hv)
1404                 vp8_filter_common_is4tap(dst + i * stride, 1);
1405             else
1406                 vp8_filter_common_isnot4tap(dst + i * stride, 1);
1407         }
1408 }
1409 
1410 void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1411         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1412 {
1413     vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1414     vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1415 }
1416 
1417 void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1418         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1419 {
1420     vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1421     vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1422 }
1423 
1424 void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1425 {
1426     int i;
1427 
1428     for (i = 0; i < 16; i++)
1429         if (vp8_simple_limit(dst + i, stride, flim))
1430             vp8_filter_common_is4tap(dst + i, stride);
1431 }
1432 
1433 void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1434 {
1435     int i;
1436 
1437     for (i = 0; i < 16; i++)
1438         if (vp8_simple_limit(dst + i * stride, 1, flim))
1439             vp8_filter_common_is4tap(dst + i * stride, 1);
1440 }
1441 
1442 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1443         ptrdiff_t srcstride, int h, int x, int y)
1444 {
1445 #if 1
1446     double ftmp[2];
1447     uint64_t tmp[2];
1448     mips_reg addr[2];
1449     DECLARE_VAR_ALL64;
1450 
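    /* Straight 16-byte-per-row copy, two rows per iteration (h is assumed
     * even, which holds for VP8 block heights): MMI_ULDC1 fetches the low
     * 8 bytes and the ldl/ldr pair the unaligned high 8 bytes; the 8- and
     * 4-pixel variants below use the same two-rows-at-a-time pattern. */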
1451     __asm__ volatile (
1452         "1:                                                         \n\t"
1453         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1454         MMI_ULDC1(%[ftmp0], %[src], 0x00)
1455         "ldl        %[tmp0],    0x0f(%[src])                        \n\t"
1456         "ldr        %[tmp0],    0x08(%[src])                        \n\t"
1457         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1458         "ldl        %[tmp1],    0x0f(%[addr0])                      \n\t"
1459         "ldr        %[tmp1],    0x08(%[addr0])                      \n\t"
1460         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1461         MMI_SDC1(%[ftmp0], %[dst], 0x00)
1462         "sdl        %[tmp0],    0x0f(%[dst])                        \n\t"
1463         "sdr        %[tmp0],    0x08(%[dst])                        \n\t"
1464         "addiu      %[h],       %[h],           -0x02               \n\t"
1465         MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1466         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1467         "sdl        %[tmp1],    0x0f(%[addr1])                      \n\t"
1468         "sdr        %[tmp1],    0x08(%[addr1])                      \n\t"
1469         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1470         "bnez       %[h],       1b                                  \n\t"
1471         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1472           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
1473           RESTRICT_ASM_ALL64
1474           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1475           [dst]"+&r"(dst),                  [src]"+&r"(src),
1476           [h]"+&r"(h)
1477         : [dststride]"r"((mips_reg)dststride),
1478           [srcstride]"r"((mips_reg)srcstride)
1479         : "memory"
1480     );
1481 #else
1482     int i;
1483 
1484     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1485         memcpy(dst, src, 16);
1486 #endif
1487 }
1488 
1489 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1490         ptrdiff_t srcstride, int h, int x, int y)
1491 {
1492 #if 1
1493     double ftmp[1];
1494     uint64_t tmp[1];
1495     mips_reg addr[2];
1496     DECLARE_VAR_ALL64;
1497 
1498     __asm__ volatile (
1499         "1:                                                         \n\t"
1500         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1501         MMI_ULDC1(%[ftmp0], %[src], 0x00)
1502         "ldl        %[tmp0],    0x07(%[addr0])                      \n\t"
1503         "ldr        %[tmp0],    0x00(%[addr0])                      \n\t"
1504         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1505         MMI_SDC1(%[ftmp0], %[dst], 0x00)
1506         "addiu      %[h],       %[h],           -0x02               \n\t"
1507         "sdl        %[tmp0],    0x07(%[addr1])                      \n\t"
1508         "sdr        %[tmp0],    0x00(%[addr1])                      \n\t"
1509         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1510         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1511         "bnez       %[h],       1b                                  \n\t"
1512         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1513           RESTRICT_ASM_ALL64
1514           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1515           [dst]"+&r"(dst),                  [src]"+&r"(src),
1516           [h]"+&r"(h)
1517         : [dststride]"r"((mips_reg)dststride),
1518           [srcstride]"r"((mips_reg)srcstride)
1519         : "memory"
1520     );
1521 #else
1522     int i;
1523 
1524     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1525         memcpy(dst, src, 8);
1526 #endif
1527 }
1528 
1529 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1530         ptrdiff_t srcstride, int h, int x, int y)
1531 {
1532 #if 1
1533     double ftmp[1];
1534     uint64_t tmp[1];
1535     mips_reg addr[2];
1536     DECLARE_VAR_LOW32;
1537 
1538     __asm__ volatile (
1539         "1:                                                         \n\t"
1540         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1541         MMI_LWC1(%[ftmp0], %[src], 0x00)
1542         "lwl        %[tmp0],    0x03(%[addr0])                      \n\t"
1543         "lwr        %[tmp0],    0x00(%[addr0])                      \n\t"
1544         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1545         MMI_SWC1(%[ftmp0], %[dst], 0x00)
1546         "addiu      %[h],       %[h],           -0x02               \n\t"
1547         "swl        %[tmp0],    0x03(%[addr1])                      \n\t"
1548         "swr        %[tmp0],    0x00(%[addr1])                      \n\t"
1549         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1550         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1551         "bnez       %[h],       1b                                  \n\t"
1552         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1553           RESTRICT_ASM_LOW32
1554           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1555           [dst]"+&r"(dst),                  [src]"+&r"(src),
1556           [h]"+&r"(h)
1557         : [dststride]"r"((mips_reg)dststride),
1558           [srcstride]"r"((mips_reg)srcstride)
1559         : "memory"
1560     );
1561 #else
1562     int i;
1563 
1564     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1565         memcpy(dst, src, 4);
1566 #endif
1567 }
1568 
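/* Sub-pixel MC, horizontal 4-tap case. Each filter tap is preloaded as a
 * 64-bit lane-splatted constant from fourtap_subpel_filters[]; ftmp4 is
 * loaded with 7, presumably the shift count used by the PUT_VP8_EPEL*
 * macros for the (... + 64) >> 7 rounding spelled out in the reference
 * comment below, with ff_pw_64 supplying the bias. */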
1569 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1570         ptrdiff_t srcstride, int h, int mx, int my)
1571 {
1572 #if 1
1573     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1574     double ftmp[9];
1575     uint32_t tmp[1];
1576     union av_intfloat64 filter1;
1577     union av_intfloat64 filter2;
1578     union av_intfloat64 filter3;
1579     union av_intfloat64 filter4;
1580     mips_reg src1, dst1;
1581     DECLARE_VAR_ALL64;
1582     filter1.i = filter[1];
1583     filter2.i = filter[2];
1584     filter3.i = filter[3];
1585     filter4.i = filter[4];
1586 
1587     /*
1588     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1589     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1590     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1591     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1592     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1593     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1594     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1595     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1596 
1597     dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1598     dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1599     dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1600     dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1601     dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1602     dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1603     dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1604     dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1605     */
1606     __asm__ volatile (
1607         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1608         "li         %[tmp0],    0x07                                \n\t"
1609         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1610 
1611         "1:                                                         \n\t"
1612         // 0 - 7
1613         PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1614         PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1615         PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1616         // 8 - 15
1617         PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1618 
1619         "addiu      %[h],       %[h],           -0x01               \n\t"
1620         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1621         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1622         "bnez       %[h],       1b                                  \n\t"
1623         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1624           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1625           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1626           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1627           [ftmp8]"=&f"(ftmp[8]),
1628           [tmp0]"=&r"(tmp[0]),
1629           RESTRICT_ASM_ALL64
1630           [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1631           [h]"+&r"(h),
1632           [dst]"+&r"(dst),                  [src]"+&r"(src)
1633         : [ff_pw_64]"f"(ff_pw_64.f),
1634           [srcstride]"r"((mips_reg)srcstride),
1635           [dststride]"r"((mips_reg)dststride),
1636           [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1637           [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1638         : "memory"
1639     );
1640 #else
1641     const uint8_t *filter = subpel_filters[mx - 1];
1642     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1643     int x, y;
1644 
1645     for (y = 0; y < h; y++) {
1646         for (x = 0; x < 16; x++)
1647             dst[x] = FILTER_4TAP(src, filter, 1);
1648         dst += dststride;
1649         src += srcstride;
1650     }
1651 #endif
1652 }
1653 
1654 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1655         ptrdiff_t srcstride, int h, int mx, int my)
1656 {
1657 #if 1
1658     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1659     double ftmp[9];
1660     uint32_t tmp[1];
1661     union av_intfloat64 filter1;
1662     union av_intfloat64 filter2;
1663     union av_intfloat64 filter3;
1664     union av_intfloat64 filter4;
1665     DECLARE_VAR_ALL64;
1666     filter1.i = filter[1];
1667     filter2.i = filter[2];
1668     filter3.i = filter[3];
1669     filter4.i = filter[4];
1670 
1671 
1672     /*
1673     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1674     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1675     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1676     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1677     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1678     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1679     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1680     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1681     */
1682     __asm__ volatile (
1683         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1684         "li         %[tmp0],    0x07                                \n\t"
1685         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1686 
1687         "1:                                                         \n\t"
1688         PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1689 
1690         "addiu      %[h],       %[h],           -0x01               \n\t"
1691         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1692         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1693         "bnez       %[h],       1b                                  \n\t"
1694         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1695           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1696           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1697           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1698           [ftmp8]"=&f"(ftmp[8]),
1699           [tmp0]"=&r"(tmp[0]),
1700           RESTRICT_ASM_ALL64
1701           [h]"+&r"(h),
1702           [dst]"+&r"(dst),                  [src]"+&r"(src)
1703         : [ff_pw_64]"f"(ff_pw_64.f),
1704           [srcstride]"r"((mips_reg)srcstride),
1705           [dststride]"r"((mips_reg)dststride),
1706           [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1707           [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1708         : "memory"
1709     );
1710 #else
1711     const uint8_t *filter = subpel_filters[mx - 1];
1712     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1713     int x, y;
1714 
1715     for (y = 0; y < h; y++) {
1716         for (x = 0; x < 8; x++)
1717             dst[x] = FILTER_4TAP(src, filter, 1);
1718         dst += dststride;
1719         src += srcstride;
1720     }
1721 #endif
1722 }
1723 
1724 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1725         ptrdiff_t srcstride, int h, int mx, int my)
1726 {
1727 #if 1
1728     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1729     double ftmp[6];
1730     uint32_t tmp[1];
1731     union av_intfloat64 filter1;
1732     union av_intfloat64 filter2;
1733     union av_intfloat64 filter3;
1734     union av_intfloat64 filter4;
1735     DECLARE_VAR_LOW32;
1736     filter1.i = filter[1];
1737     filter2.i = filter[2];
1738     filter3.i = filter[3];
1739     filter4.i = filter[4];
1740 
1741     /*
1742     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1743     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1744     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1745     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1746     */
1747     __asm__ volatile (
1748         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1749         "li         %[tmp0],    0x07                                \n\t"
1750         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1751 
1752         "1:                                                         \n\t"
1753         PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1754 
1755         "addiu      %[h],       %[h],           -0x01               \n\t"
1756         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1757         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1758         "bnez       %[h],       1b                                  \n\t"
1759         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1760           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1761           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1762           [tmp0]"=&r"(tmp[0]),
1763           RESTRICT_ASM_LOW32
1764           [h]"+&r"(h),
1765           [dst]"+&r"(dst),                  [src]"+&r"(src)
1766         : [ff_pw_64]"f"(ff_pw_64.f),
1767           [srcstride]"r"((mips_reg)srcstride),
1768           [dststride]"r"((mips_reg)dststride),
1769           [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1770           [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1771         : "memory"
1772     );
1773 #else
1774     const uint8_t *filter = subpel_filters[mx - 1];
1775     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1776     int x, y;
1777 
1778     for (y = 0; y < h; y++) {
1779         for (x = 0; x < 4; x++)
1780             dst[x] = FILTER_4TAP(src, filter, 1);
1781         dst += dststride;
1782         src += srcstride;
1783     }
1784 #endif
1785 }
1786 
1787 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1788         ptrdiff_t srcstride, int h, int mx, int my)
1789 {
1790 #if 1
1791     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1792     double ftmp[9];
1793     uint32_t tmp[1];
1794     mips_reg src1, dst1;
1795     union av_intfloat64 filter0;
1796     union av_intfloat64 filter1;
1797     union av_intfloat64 filter2;
1798     union av_intfloat64 filter3;
1799     union av_intfloat64 filter4;
1800     union av_intfloat64 filter5;
1801     DECLARE_VAR_ALL64;
1802     filter0.i = filter[0];
1803     filter1.i = filter[1];
1804     filter2.i = filter[2];
1805     filter3.i = filter[3];
1806     filter4.i = filter[4];
1807     filter5.i = filter[5];
1808 
1809     /*
1810     dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1811     dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1812     dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1813     dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1814     dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1815     dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1816     dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1817     dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1818 
1819     dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1820     dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1821     dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1822     dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1823     dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1824     dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1825     dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1826     dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1827     */
1828     __asm__ volatile (
1829         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1830         "li         %[tmp0],    0x07                                \n\t"
1831         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1832 
1833         "1:                                                         \n\t"
1834         // 0 - 7
1835         PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1836         PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1837         PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1838         // 8 - 15
1839         PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1840 
1841         "addiu      %[h],       %[h],           -0x01               \n\t"
1842         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1843         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1844         "bnez       %[h],       1b                                  \n\t"
1845         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1846           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1847           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1848           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1849           [ftmp8]"=&f"(ftmp[8]),
1850           [tmp0]"=&r"(tmp[0]),
1851           RESTRICT_ASM_ALL64
1852           [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1853           [h]"+&r"(h),
1854           [dst]"+&r"(dst),                  [src]"+&r"(src)
1855         : [ff_pw_64]"f"(ff_pw_64.f),
1856           [srcstride]"r"((mips_reg)srcstride),
1857           [dststride]"r"((mips_reg)dststride),
1858           [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
1859           [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
1860           [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
1861         : "memory"
1862     );
1863 #else
1864     const uint8_t *filter = subpel_filters[mx - 1];
1865     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1866     int x, y;
1867 
1868     for (y = 0; y < h; y++) {
1869         for (x = 0; x < 16; x++)
1870             dst[x] = FILTER_6TAP(src, filter, 1);
1871         dst += dststride;
1872         src += srcstride;
1873     }
1874 #endif
1875 }
1876 
1877 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1878         ptrdiff_t srcstride, int h, int mx, int my)
1879 {
1880 #if 1
1881     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1882     double ftmp[9];
1883     uint32_t tmp[1];
1884     union av_intfloat64 filter0;
1885     union av_intfloat64 filter1;
1886     union av_intfloat64 filter2;
1887     union av_intfloat64 filter3;
1888     union av_intfloat64 filter4;
1889     union av_intfloat64 filter5;
1890     DECLARE_VAR_ALL64;
1891     filter0.i = filter[0];
1892     filter1.i = filter[1];
1893     filter2.i = filter[2];
1894     filter3.i = filter[3];
1895     filter4.i = filter[4];
1896     filter5.i = filter[5];
1897 
1898     /*
1899     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1900     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1901     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1902     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1903     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1904     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1905     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1906     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1907     */
1908     __asm__ volatile (
1909         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1910         "li         %[tmp0],    0x07                                \n\t"
1911         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1912 
1913         "1:                                                         \n\t"
1914         PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1915 
1916         "addiu      %[h],       %[h],           -0x01               \n\t"
1917         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1918         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1919         "bnez       %[h],       1b                                  \n\t"
1920         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1921           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1922           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1923           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1924           [ftmp8]"=&f"(ftmp[8]),
1925           [tmp0]"=&r"(tmp[0]),
1926           RESTRICT_ASM_ALL64
1927           [h]"+&r"(h),
1928           [dst]"+&r"(dst),                  [src]"+&r"(src)
1929         : [ff_pw_64]"f"(ff_pw_64.f),
1930           [srcstride]"r"((mips_reg)srcstride),
1931           [dststride]"r"((mips_reg)dststride),
1932           [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
1933           [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
1934           [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
1935         : "memory"
1936     );
1937 #else
1938     const uint8_t *filter = subpel_filters[mx - 1];
1939     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1940     int x, y;
1941 
1942     for (y = 0; y < h; y++) {
1943         for (x = 0; x < 8; x++)
1944             dst[x] = FILTER_6TAP(src, filter, 1);
1945         dst += dststride;
1946         src += srcstride;
1947     }
1948 #endif
1949 }
1950 
1951 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1952         ptrdiff_t srcstride, int h, int mx, int my)
1953 {
1954 #if 1
1955     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1956     double ftmp[6];
1957     uint32_t tmp[1];
1958     union av_intfloat64 filter0;
1959     union av_intfloat64 filter1;
1960     union av_intfloat64 filter2;
1961     union av_intfloat64 filter3;
1962     union av_intfloat64 filter4;
1963     union av_intfloat64 filter5;
1964     DECLARE_VAR_LOW32;
1965     filter0.i = filter[0];
1966     filter1.i = filter[1];
1967     filter2.i = filter[2];
1968     filter3.i = filter[3];
1969     filter4.i = filter[4];
1970     filter5.i = filter[5];
1971 
1972     /*
1973     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1974     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1975     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1976     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1977     */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
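    /* Only the middle four coefficients are loaded: for the subpel positions
     * that select this 4-tap path, filter[0] and filter[5] of the 6-tap
     * table are zero, so the outer taps contribute nothing. */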

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];

    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_ALL64;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[6];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    DECLARE_VAR_LOW32;
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];

    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_ALL64;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[my - 1];
    double ftmp[6];
    uint32_t tmp[1];
    mips_reg src1;
    union av_intfloat64 filter0;
    union av_intfloat64 filter1;
    union av_intfloat64 filter2;
    union av_intfloat64 filter3;
    union av_intfloat64 filter4;
    union av_intfloat64 filter5;
    DECLARE_VAR_LOW32;
    filter0.i = filter[0];
    filter1.i = filter[1];
    filter2.i = filter[2];
    filter3.i = filter[3];
    filter4.i = filter[4];
    filter5.i = filter[5];

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64.f),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[my - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, srcstride);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
    uint8_t *tmp = tmp_array;
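    /* Two-pass 4-tap filtering through an aligned temporary with a 16-byte
     * row stride: the horizontal pass starts one row above dst's first row
     * and produces h + 3 rows, so the vertical pass (which reads rows
     * -1..+2 around each output row, starting at tmp_array + 16) never
     * leaves the buffer. */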

    src -= srcstride;
    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 16;
    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[560];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp    = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 8;
    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[152];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp    = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
    tmp = tmp_array + 4;
    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[44];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }
    tmp    = tmp_array + 4;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
    uint8_t *tmp = tmp_array;
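    /* Same two-pass scheme as the h4v4 case, but the 6-tap vertical filter
     * needs two rows of context above and three below each output row: the
     * horizontal pass starts two rows early and produces h + 5 rows, and the
     * vertical pass begins at tmp_array + 32, i.e. row 2 of the
     * 16-byte-stride temporary. */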

    src -= 2 * srcstride;
    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 32;
    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[592];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp    = tmp_array + 32;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 16;
    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[168];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp    = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 8;
    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[52];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_4TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp    = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
    tmp    = tmp_array + 16;
    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[560];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp    = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
    tmp    = tmp_array + 8;
    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[152];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp    = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
    uint8_t *tmp = tmp_array;

    src -= srcstride;
    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
    tmp    = tmp_array + 4;
    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[44];
    uint8_t *tmp = tmp_array;

    src -= srcstride;

    for (y = 0; y < h + 3; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp    = tmp_array + 4;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 32;
    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[592];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 16;
        src += srcstride;
    }

    tmp    = tmp_array + 32;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 16);
        dst += dststride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 16;
    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[168];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 8;
        src += srcstride;
    }

    tmp    = tmp_array + 16;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 8);
        dst += dststride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;
    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
    tmp    = tmp_array + 8;
    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;
    uint8_t tmp_array[52];
    uint8_t *tmp = tmp_array;

    src -= 2 * srcstride;

    for (y = 0; y < h + 5; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = FILTER_6TAP(src, filter, 1);
        tmp += 4;
        src += srcstride;
    }

    tmp    = tmp_array + 8;
    filter = subpel_filters[my - 1];

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(tmp, filter, 4);
        dst += dststride;
        tmp += 4;
    }
#endif
}

void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg dst0, src0;
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;

    dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
    dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
    dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
    dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
    dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
    dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
    dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
    dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
    */
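    /* pshufh with an all-zero selector register broadcasts the low 16-bit
     * weight of a and b across all four halfword lanes; ff_pw_4 supplies the
     * +4 rounding bias and ftmp4 holds the final ">> 3" shift amount. */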
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst0]"=&r"(dst0),            [src0]"=&r"(src0),
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [a]"+&f"(a.f),                [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src0, src1, dst0;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
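    /* src1 is a scratch register handed to PUT_VP8_BILINEAR8_V_MMI; judging
     * by its operand list, it holds the pointer to the row below
     * (src + sstride) that the vertical blend reads alongside the current
     * row. */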
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src0]"=&r"(src0),            [dst0]"=&r"(dst0),
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [c]"+&f"(c.f),                [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
    uint8_t *tmp = tmp_array;
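    /* Two-pass bilinear: the horizontal pass writes h + 1 rows into the
     * 16-byte-stride temporary, and the vertical pass then blends each temp
     * row with the one below it, exactly as the scalar fallback does with
     * tmp[x] and tmp[x + 16]. */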

    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[528];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 16; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 16;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
        dst += dstride;
        tmp += 16;
    }
#endif
}

void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[7];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [a]"+&f"(a.f),                [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [c]"+&f"(c.f),                [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

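    /* Two-pass filter: the horizontal pass below writes h + 1 rows of 8
     * bytes into tmp_array (8 * (16 + 1) = 136 bytes, i.e. room for
     * heights up to 16), then the vertical pass filters between rows
     * that sit 8 bytes apart, as the #else fallback spells out. */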
    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 a, b;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    a.i = 8 - mx;
    b.i = mx;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
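    /* Same weight broadcast as the 8-pixel variant, but each iteration
     * filters a single 4-pixel row, so only ftmp0..ftmp4 are declared;
     * DECLARE_VAR_LOW32 presumably backs the 32-bit (4-byte) loads and
     * stores inside PUT_VP8_BILINEAR4_H_MMI. */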
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [a]"+&f"(a.f),                [b]"+&f"(b.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int a = 8 - mx, b = mx;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    union mmi_intfloat64 c, d;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    c.i = 8 - my;
    d.i = my;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
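    /* Vertical 4-pixel variant: as in the 8-pixel case, src1 is scratch
     * for the second source row at src + sstride consumed by
     * PUT_VP8_BILINEAR4_V_MMI. */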
    __asm__ volatile (
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),              [src]"+&r"(src),
          [c]"+&f"(c.f),                [d]"+&f"(d.f)
        : [sstride]"r"((mips_reg)sstride),
          [dstride]"r"((mips_reg)dstride),
          [ff_pw_4]"f"(ff_pw_4.f)
        : "memory"
    );
#else
    int c = 8 - my, d = my;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
        dst += dstride;
        src += sstride;
    }
#endif
}

void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

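    /* Two-pass filter, 4-pixel flavour: 4 * (8 + 1) = 36 intermediate
     * bytes cover heights up to 8; the vertical pass reads rows 4 bytes
     * apart (tmp[x] and tmp[x + 4] in the fallback below). */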
    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}
