/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 
23 #include "libavutil/attributes.h"
24 #include "libavutil/avassert.h"
25 #include "libavutil/mem_internal.h"
26 
27 #include "libavcodec/vc1dsp.h"
28 #include "constants.h"
29 #include "vc1dsp_mips.h"
30 #include "hpeldsp_mips.h"
31 #include "libavutil/mips/mmiutils.h"
32 
/*
 * One output-row pair of the 8-point inverse transform (first-pass variant).
 *
 * Expands to inline-asm text; it must be used inside an __asm__ statement
 * that provides the named operands listed below.
 *
 * Parameters:
 *   o1, o2  destination FP registers; each receives four packed 16-bit
 *           results (o1 = "sum" outputs, o2 = "difference" outputs).
 *   r1..r4  32-bit immediates, each packing two 16-bit coefficients; every
 *           immediate is loaded through %[tmp0] and duplicated into both
 *           32-bit halves of %[ftmp13] / %[ftmp14] before the multiply.
 *   c0      FP register operand added to the %[ftmp1]/%[ftmp2] accumulators
 *           before the sum/difference step (i.e. it reaches both o1 and o2).
 *
 * Expected on entry:
 *   %[ftmp5]..%[ftmp8]   inputs multiplied by r1/r2 (ftmp5,ftmp6 with r1;
 *                        ftmp7,ftmp8 with r2)
 *   %[ftmp9]..%[ftmp12]  inputs multiplied by r3/r4 (ftmp9,ftmp10 with r3;
 *                        ftmp11,ftmp12 with r4)
 *   %[ftmp0]             shift count used by the final psraw
 *
 * Computation, per 32-bit lane:
 *   a = ftmp5*r1 + ftmp7*r2 + c0        (second half: ftmp6, ftmp8)
 *   b = ftmp9*r3 + ftmp11*r4            (second half: ftmp10, ftmp12)
 *   o1 = (a + b) >> ftmp0,  o2 = (a - b) >> ftmp0
 * The trailing punpcklhw/punpckhhw/punpcklhw triples gather the low
 * halfword of each of the four 32-bit results into one register.
 *
 * Clobbers: %[tmp0], %[ftmp1]..%[ftmp4], %[ftmp13], %[ftmp14].
 * %[ftmp5]..%[ftmp12] and %[ftmp0] are left untouched, so the macro can be
 * invoked repeatedly on the same inputs with different coefficients.
 */
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
        "li         %[tmp0],    "#r1"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r2"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
                                                                            \
        "li         %[tmp0],    "#r3"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r4"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
                                                                            \
        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   "#c0"                     \n\t" \
        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
76 
/*
 * One output-row pair of the 8-point inverse transform (second-pass
 * variant).
 *
 * Same multiply/accumulate and packing sequence as the TYPE1 variant, with
 * a different rounding step: nothing is added before the sum/difference;
 * afterwards c0 is added to all four results and c1 is additionally added
 * to the two "difference" results only.
 *
 * Parameters:
 *   o1, o2  destination FP registers; each receives four packed 16-bit
 *           results (o1 = "sum" outputs, o2 = "difference" outputs).
 *   r1..r4  32-bit immediates, each packing two 16-bit coefficients; every
 *           immediate is loaded through %[tmp0] and duplicated into both
 *           32-bit halves of %[ftmp13] / %[ftmp14] before the multiply.
 *   c0      FP register operand added to every result before the shift.
 *   c1      FP register operand added only to the results that end up
 *           in o2.
 *
 * Expected on entry:
 *   %[ftmp5]..%[ftmp8]   inputs multiplied by r1/r2
 *   %[ftmp9]..%[ftmp12]  inputs multiplied by r3/r4
 *   %[ftmp0]             shift count used by the final psraw
 *
 * Computation, per 32-bit lane:
 *   a = ftmp5*r1 + ftmp7*r2             (second half: ftmp6, ftmp8)
 *   b = ftmp9*r3 + ftmp11*r4            (second half: ftmp10, ftmp12)
 *   o1 = (a + b + c0)      >> ftmp0
 *   o2 = (a - b + c1 + c0) >> ftmp0
 *
 * Clobbers: %[tmp0], %[ftmp1]..%[ftmp4], %[ftmp13], %[ftmp14].
 */
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
        "li         %[tmp0],    "#r1"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r2"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
                                                                            \
        "li         %[tmp0],    "#r3"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r4"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
                                                                            \
        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
        "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
        "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
124 
125 /* Do inverse transform on 8x8 block */
ff_vc1_inv_trans_8x8_dc_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)126 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
127 {
128     int dc = block[0];
129     double ftmp[9];
130     mips_reg addr[1];
131     int count;
132     union mmi_intfloat64 dc_u;
133 
134     dc = (3 * dc +  1) >> 1;
135     dc = (3 * dc + 16) >> 5;
136     dc_u.i = dc;
137 
138     __asm__ volatile(
139         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
140         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
141         "li         %[count],   0x02                                    \n\t"
142 
143         "1:                                                             \n\t"
144         MMI_LDC1(%[ftmp1], %[dest], 0x00)
145         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
146         MMI_LDC1(%[ftmp2], %[addr0], 0x00)
147         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
148         MMI_LDC1(%[ftmp3], %[addr0], 0x00)
149         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
150         MMI_LDC1(%[ftmp4], %[addr0], 0x00)
151 
152         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
153         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
154         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
155         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
156         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
157         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
158         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
159         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
160 
161         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
162         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
163         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
164         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
165         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
166         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
167         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
168         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
169 
170         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
171         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
172         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
173         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
174 
175         MMI_SDC1(%[ftmp1], %[dest], 0x00)
176         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
177         MMI_SDC1(%[ftmp2], %[addr0], 0x00)
178         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
179         MMI_SDC1(%[ftmp3], %[addr0], 0x00)
180         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
181         MMI_SDC1(%[ftmp4], %[addr0], 0x00)
182 
183         "addiu      %[count],   %[count],       -0x01                   \n\t"
184         PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
185         "bnez       %[count],   1b                                      \n\t"
186         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
187           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
188           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
189           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
190           [ftmp8]"=&f"(ftmp[8]),
191           [addr0]"=&r"(addr[0]),
192           [count]"=&r"(count),          [dest]"+&r"(dest)
193         : [linesize]"r"((mips_reg)linesize),
194           [dc]"f"(dc_u.f)
195         : "memory"
196     );
197 }
198 
#if _MIPS_SIM != _ABIO32
/*
 * In-place inverse transform of an 8x8 block of 16-bit coefficients.
 *
 * The work is done in two passes ("loops" in the comments below), each
 * split into two parts that handle four 16-bit columns at a time:
 *
 *  - 1st loop reads block[], runs the TYPE1 macro with shift count 3 and
 *    rounding operand %[ff_pw_4], transposes the results with TRANSPOSE_4H
 *    and writes them to the scratch buffer temp[].  Registers ftmp15..18
 *    produced by its 2nd part are NOT stored: they are consumed directly by
 *    the 1st part of the 2nd loop, so temp offsets 0x40/0x50/0x60/0x70 are
 *    never written or read.
 *  - 2nd loop reads temp[] (plus those live registers), runs the TYPE2
 *    macro with shift count 7, rounding operand %[ff_pw_64] on every output
 *    and the extra %[ff_pw_1] on the outputs stored at block offsets
 *    0x40..0x78, and writes the final values back to block[].
 *
 * Each 32-bit immediate passed to the macros packs two 16-bit transform
 * coefficients (e.g. 0x0010000c holds 0x0010 and 0x000c).
 *
 * NOTE(review): ff_pw_32_4 / ff_pw_32_64 / ff_pw_32_1 are defined outside
 * this file; presumably packed 32-bit 4 / 64 / 1 — confirm in constants.h.
 *
 * block  64 coefficients, row stride 16 bytes; overwritten with the result
 */
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        /* ftmp0 = 3: shift count used by the TYPE1 macro */
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // 1st part
        /* first four coefficients of each of the eight 16-byte rows */
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        /* interleave rows pairwise so pmaddhw sees (row a, row b) pairs */
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        /* transposed results fill the first four 16-byte rows of temp */
        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        /* last four coefficients of each of the eight rows */
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        /* only ftmp19..22 go to memory; ftmp15..18 stay live in registers
         * and are used directly by the 1st part of the 2nd loop */
        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        /* ftmp0 = 7: shift count used by the TYPE2 macro */
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        /* ftmp15..18 are still live from the end of the 1st loop */
        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]                   \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]                   \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        /* first four columns of all eight output rows */
        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        /* last four columns of all eight output rows */
        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_32_1.f),   [ff_pw_64]"f"(ff_pw_32_64.f),
          [ff_pw_4]"f"(ff_pw_32_4.f),   [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif
416 
417 /* Do inverse transform on 8x4 part of block */
ff_vc1_inv_trans_8x4_dc_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)418 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
419 {
420     int dc = block[0];
421     double ftmp[9];
422     union mmi_intfloat64 dc_u;
423 
424     dc = ( 3 * dc +  1) >> 1;
425     dc = (17 * dc + 64) >> 7;
426     dc_u.i = dc;
427 
428     __asm__ volatile(
429         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
430         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
431 
432         MMI_LDC1(%[ftmp1], %[dest0], 0x00)
433         MMI_LDC1(%[ftmp2], %[dest1], 0x00)
434         MMI_LDC1(%[ftmp3], %[dest2], 0x00)
435         MMI_LDC1(%[ftmp4], %[dest3], 0x00)
436 
437         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
438         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
439         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
440         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
441         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
442         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
443         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
444         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
445 
446         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
447         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
448         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
449         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
450         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
451         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
452         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
453         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
454 
455         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
456         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
457         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
458         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
459 
460         MMI_SDC1(%[ftmp1], %[dest0], 0x00)
461         MMI_SDC1(%[ftmp2], %[dest1], 0x00)
462         MMI_SDC1(%[ftmp3], %[dest2], 0x00)
463         MMI_SDC1(%[ftmp4], %[dest3], 0x00)
464         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
465           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
466           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
467           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
468           [ftmp8]"=&f"(ftmp[8])
469         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
470           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
471           [dc]"f"(dc_u.f)
472         : "memory"
473     );
474 }
475 
476 #if _MIPS_SIM != _ABIO32
ff_vc1_inv_trans_8x4_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)477 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
478 {
479     int16_t *src = block;
480     int16_t *dst = block;
481     double ftmp[16];
482     uint32_t tmp[1];
483     int16_t count = 4;
484     int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
485                          12, 15,   6,  -4, -12, -16, -16,  -9,
486                          12,  9,  -6, -16, -12,   4,  16,  15,
487                          12,  4, -16,  -9,  12,  15,  -6, -16,
488                          12, -4, -16,   9,  12, -15,  -6,  16,
489                          12, -9,  -6,  16, -12,  -4,  16, -15,
490                          12, -15,  6,   4, -12,  16, -16,   9,
491                          12, -16, 16, -15,  12,  -9,   6,  -4};
492 
493     // 1st loop
494     __asm__ volatile (
495         "li         %[tmp0],    0x03                                    \n\t"
496         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
497 
498         "1:                                                             \n\t"
499         MMI_LDC1(%[ftmp1], %[src], 0x00)
500         MMI_LDC1(%[ftmp2], %[src], 0x08)
501 
502         /* ftmp11: dst1,dst0 */
503         MMI_LDC1(%[ftmp3], %[coeff], 0x00)
504         MMI_LDC1(%[ftmp4], %[coeff], 0x08)
505         MMI_LDC1(%[ftmp5], %[coeff], 0x10)
506         MMI_LDC1(%[ftmp6], %[coeff], 0x18)
507         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
508         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
509         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
510         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
511         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
512         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
513         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
514         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
515         "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
516         "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
517 
518         /* ftmp12: dst3,dst2 */
519         MMI_LDC1(%[ftmp3], %[coeff], 0x20)
520         MMI_LDC1(%[ftmp4], %[coeff], 0x28)
521         MMI_LDC1(%[ftmp5], %[coeff], 0x30)
522         MMI_LDC1(%[ftmp6], %[coeff], 0x38)
523         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
524         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
525         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
526         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
527         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
528         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
529         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
530         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
531         "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
532         "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
533 
534         /* ftmp13: dst5,dst4 */
535         MMI_LDC1(%[ftmp3], %[coeff], 0x40)
536         MMI_LDC1(%[ftmp4], %[coeff], 0x48)
537         MMI_LDC1(%[ftmp5], %[coeff], 0x50)
538         MMI_LDC1(%[ftmp6], %[coeff], 0x58)
539         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
540         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
541         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
542         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
543         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
544         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
545         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
546         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
547         "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
548         "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
549 
550         /* ftmp14: dst7,dst6 */
551         MMI_LDC1(%[ftmp3], %[coeff], 0x60)
552         MMI_LDC1(%[ftmp4], %[coeff], 0x68)
553         MMI_LDC1(%[ftmp5], %[coeff], 0x70)
554         MMI_LDC1(%[ftmp6], %[coeff], 0x78)
555         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
556         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
557         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
558         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
559         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
560         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
561         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
562         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
563         "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
564         "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
565 
566         /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
567         "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
568         "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
569         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
570         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
571         "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
572         "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
573         "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
574         "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
575         "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
576         "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
577         MMI_SDC1(%[ftmp9], %[dst], 0x00)
578         MMI_SDC1(%[ftmp10], %[dst], 0x08)
579 
580         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
581         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
582         "addiu      %[count],   %[count],   -0x01                       \n\t"
583         "bnez       %[count],   1b                                      \n\t"
584         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
585           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
586           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
587           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
588           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
589           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
590           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
591           [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
592           [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
593         : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
594         : "memory"
595     );
596 
597     src = block;
598 
599     // 2nd loop
600     __asm__ volatile (
601         "li         %[tmp0],    0x44                                    \n\t"
602         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
603 
604         // 1st part
605         "li         %[tmp0],    0x07                                    \n\t"
606         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
607         MMI_LDC1(%[ftmp1], %[src], 0x00)
608         MMI_LDC1(%[ftmp2], %[src], 0x10)
609         MMI_LDC1(%[ftmp3], %[src], 0x20)
610         MMI_LDC1(%[ftmp4], %[src], 0x30)
611         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
612         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
613         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
614         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
615 
616         /* ftmp11: dst03,dst02,dst01,dst00 */
617         "li         %[tmp0],    0x00160011                              \n\t"
618         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
619         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
620         "li         %[tmp0],    0x000a0011                              \n\t"
621         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
622         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
623         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
624         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
625         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
626         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
627         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
628         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
629         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
630         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
631         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
632         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
633         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
634         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
635         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
636 
637         /* ftmp12: dst13,dst12,dst11,dst10 */
638         "li         %[tmp0],    0x000a0011                              \n\t"
639         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
640         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
641         "li         %[tmp0],    0xffeaffef                              \n\t"
642         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
643         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
644         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
645         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
646         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
647         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
648         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
649         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
650         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
651         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
652         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
653         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
654         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
655         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
656         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
657 
658         /* ftmp13: dst23,dst22,dst21,dst20 */
659         "li         %[tmp0],    0xfff60011                              \n\t"
660         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
661         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
662         "li         %[tmp0],    0x0016ffef                              \n\t"
663         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
664         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
665         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
666         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
667         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
668         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
669         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
670         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
671         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
672         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
673         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
674         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
675         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
676         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
677         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
678 
679         /* ftmp14: dst33,dst32,dst31,dst30 */
680         "li         %[tmp0],    0xffea0011                              \n\t"
681         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
682         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
683         "li         %[tmp0],    0xfff60011                              \n\t"
684         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
685         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
686         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
687         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
688         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
689         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
690         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
691         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
692         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
693         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
694         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
695         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
696         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
697         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
698         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
699 
700         MMI_LWC1(%[ftmp1], %[dest], 0x00)
701         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
702         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
703         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
704         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
705         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
706         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
707         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
708         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
709         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
710         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
711         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
712         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
713         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
714         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
715         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
716         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
717         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
718         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
719         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
720         MMI_SWC1(%[ftmp1], %[dest], 0x00)
721         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
722         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
723         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
724         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
725         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
726         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
727 
728         // 2nd part
729         "li         %[tmp0],    0x07                                    \n\t"
730         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
731         MMI_LDC1(%[ftmp1], %[src], 0x08)
732         MMI_LDC1(%[ftmp2], %[src], 0x18)
733         MMI_LDC1(%[ftmp3], %[src], 0x28)
734         MMI_LDC1(%[ftmp4], %[src], 0x38)
735         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
736         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
737         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
738         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
739 
740         /* ftmp11: dst03,dst02,dst01,dst00 */
741         "li         %[tmp0],    0x00160011                              \n\t"
742         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
743         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
744         "li         %[tmp0],    0x000a0011                              \n\t"
745         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
746         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
747         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
748         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
749         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
750         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
751         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
752         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
753         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
754         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
755         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
756         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
757         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
758         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
759         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
760 
761         /* ftmp12: dst13,dst12,dst11,dst10 */
762         "li         %[tmp0],    0x000a0011                              \n\t"
763         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
764         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
765         "li         %[tmp0],    0xffeaffef                              \n\t"
766         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
767         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
768         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
769         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
770         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
771         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
772         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
773         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
774         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
775         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
776         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
777         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
778         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
779         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
780         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
781 
782         /* ftmp13: dst23,dst22,dst21,dst20 */
783         "li         %[tmp0],    0xfff60011                              \n\t"
784         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
785         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
786         "li         %[tmp0],    0x0016ffef                              \n\t"
787         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
788         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
789         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
790         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
791         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
792         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
793         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
794         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
795         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
796         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
797         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
798         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
799         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
800         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
801         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
802 
803         /* ftmp14: dst33,dst32,dst31,dst30 */
804         "li         %[tmp0],    0xffea0011                              \n\t"
805         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
806         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
807         "li         %[tmp0],    0xfff60011                              \n\t"
808         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
809         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
810         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
811         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
812         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
813         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
814         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
815         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
816         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
817         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
818         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
819         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
820         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
821         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
822         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
823 
824         MMI_LWC1(%[ftmp1], %[dest], 0x04)
825         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
826         MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
827         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
828         MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
829         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
830         MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
831         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
832         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
833         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
834         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
835         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
836         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
837         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
838         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
839         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
840         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
841         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
842         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
843         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
844         MMI_SWC1(%[ftmp1], %[dest], 0x04)
845         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
846         MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
847         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
848         MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
849         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
850         MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
851 
852         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
853           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
854           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
855           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
856           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
857           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
858           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
859           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
860           [tmp0]"=&r"(tmp[0])
861         : [ff_pw_64]"f"(ff_pw_32_64.f),
862           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
863         :"memory"
864     );
865 }
866 #endif
867 
868 /* Do inverse transform on 4x8 parts of block */
ff_vc1_inv_trans_4x8_dc_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)869 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
870 {
871     int dc = block[0];
872     double ftmp[9];
873     union mmi_intfloat64 dc_u;
874     DECLARE_VAR_LOW32;
875 
876     dc = (17 * dc +  4) >> 3;
877     dc = (12 * dc + 64) >> 7;
878     dc_u.i = dc;
879 
880     __asm__ volatile(
881         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
882         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
883 
884         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
885         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
886         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
887         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
888         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
889         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
890         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
891         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
892 
893         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
894         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
895         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
896         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
897         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
898         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
899         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
900         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
901 
902         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
903         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
904         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
905         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
906         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
907         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
908         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
909         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
910 
911         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
912         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
913         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
914         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
915         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
916         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
917         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
918         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
919 
920         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
921         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
922         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
923         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
924         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
925         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
926         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
927         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
928         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
929           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
930           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
931           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
932           RESTRICT_ASM_LOW32
933           [ftmp8]"=&f"(ftmp[8])
934         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
935           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
936           [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
937           [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
938           [dc]"f"(dc_u.f)
939         : "memory"
940     );
941 }
942 
943 #if _MIPS_SIM != _ABIO32
ff_vc1_inv_trans_4x8_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)944 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
945 {
946     int16_t *src = block;
947     int16_t *dst = block;
948     double ftmp[23];
949     uint64_t count = 8, tmp[1];
950     int16_t coeff[16] = {17, 22, 17, 10,
951                          17, 10,-17,-22,
952                          17,-10,-17, 22,
953                          17,-22, 17,-10};
954 
955     // 1st loop
956     __asm__ volatile (
957 
958         "li         %[tmp0],    0x03                                    \n\t"
959         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
960 
961         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
962         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
963         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
964         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
965         "1:                                                             \n\t"
966         /* ftmp8: dst3,dst2,dst1,dst0 */
967         MMI_LDC1(%[ftmp1], %[src], 0x00)
968         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
969         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
970         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
971         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
972         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
973         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
974         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
975         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
976         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
977         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
978         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
979         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
980         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
981         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
982         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
983         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
984         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
985         MMI_SDC1(%[ftmp8], %[dst], 0x00)
986 
987         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
988         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
989         "addiu      %[count],   %[count],   -0x01                       \n\t"
990         "bnez       %[count],   1b                                      \n\t"
991         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
992           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
993           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
994           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
995           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
996           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
997           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
998           [src]"+&r"(src),              [dst]"+&r"(dst)
999         : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
1000         : "memory"
1001     );
1002 
1003     src = block;
1004 
1005     // 2nd loop
1006     __asm__ volatile (
1007         "li         %[tmp0],    0x07                                    \n\t"
1008         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1009 
1010         MMI_LDC1(%[ftmp1], %[src], 0x00)
1011         MMI_LDC1(%[ftmp2], %[src], 0x20)
1012         MMI_LDC1(%[ftmp3], %[src], 0x40)
1013         MMI_LDC1(%[ftmp4], %[src], 0x60)
1014         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1015         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1016         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1017         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1018 
1019         MMI_LDC1(%[ftmp1], %[src], 0x10)
1020         MMI_LDC1(%[ftmp2], %[src], 0x30)
1021         MMI_LDC1(%[ftmp3], %[src], 0x50)
1022         MMI_LDC1(%[ftmp4], %[src], 0x70)
1023         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1024         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1025         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1026         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1027 
1028         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1029         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1030                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1031 
1032         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1033         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1034                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1035 
1036         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1037         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1038                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1039 
1040         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1041         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1042                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1043 
1044         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1045         PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1046         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1047         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1048         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1049         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1050         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1051         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1052         MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1053         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1054         MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1055         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1056         MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1057         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1058         MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1059         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1060         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1061         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1062         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1063         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1064         "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1065         "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1066         "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1067         "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1068 
1069         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1070         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1071         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1072         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1073         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1074         "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1075         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1076         "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1077 
1078         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1079         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1080         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1081         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1082         "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1083         "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1084         "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1085         "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1086 
1087         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1088         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1089         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1090         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1091         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1092         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1093         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1094         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1095         MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1096         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1097         MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1098         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1099         MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1100         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1101         MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1102 
1103         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1104           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1105           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1106           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1107           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1108           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1109           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1110           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1111           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1112           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1113           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1114           [ftmp22]"=&f"(ftmp[22]),
1115           [tmp0]"=&r"(tmp[0])
1116         : [ff_pw_1]"f"(ff_pw_32_1.f),   [ff_pw_64]"f"(ff_pw_32_64.f),
1117           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1118         : "memory"
1119     );
1120 }
1121 #endif
1122 
1123 /* Do inverse transform on 4x4 part of block */
ff_vc1_inv_trans_4x4_dc_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)1124 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1125 {
1126     int dc = block[0];
1127     double ftmp[5];
1128     union mmi_intfloat64 dc_u;
1129     DECLARE_VAR_LOW32;
1130 
1131     dc = (17 * dc +  4) >> 3;
1132     dc = (17 * dc + 64) >> 7;
1133     dc_u.i = dc;
1134 
1135     __asm__ volatile(
1136         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1137         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1138 
1139         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1140         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1141         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1142         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1143 
1144         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1145         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1146         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1147         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1148 
1149         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1150         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1151         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1152         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1153 
1154         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1155         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1156         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1157         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1158 
1159         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1160         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1161         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1162         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1163         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1164           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1165           RESTRICT_ASM_LOW32
1166           [ftmp4]"=&f"(ftmp[4])
1167         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1168           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1169           [dc]"f"(dc_u.f)
1170         : "memory"
1171     );
1172 }
1173 
ff_vc1_inv_trans_4x4_mmi(uint8_t * dest,ptrdiff_t linesize,int16_t * block)1174 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1175 {
1176     int16_t *src = block;
1177     int16_t *dst = block;
1178     double ftmp[16];
1179     uint32_t count = 4, tmp[1];
1180     int16_t coeff[16] = {17, 22, 17, 10,
1181                          17, 10,-17,-22,
1182                          17,-10,-17, 22,
1183                          17,-22, 17,-10};
1184     // 1st loop
1185     __asm__ volatile (
1186 
1187         "li         %[tmp0],    0x03                                    \n\t"
1188         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1189         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1190         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1191         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1192         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1193         "1:                                                             \n\t"
1194         /* ftmp8: dst3,dst2,dst1,dst0 */
1195         MMI_LDC1(%[ftmp1], %[src], 0x00)
1196         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1197         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1198         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1199         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1200         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1201         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1202         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1203         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1204         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1205         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1206         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1207         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1208         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1209         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1210         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1211         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1212         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1213         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1214 
1215         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1216         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1217         "addiu      %[count],   %[count],   -0x01                       \n\t"
1218         "bnez       %[count],   1b                                      \n\t"
1219         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1220           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1221           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1222           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1223           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1224           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1225           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1226           [src]"+&r"(src),              [dst]"+&r"(dst)
1227         : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
1228         : "memory"
1229     );
1230 
1231     src = block;
1232 
1233     // 2nd loop
1234     __asm__ volatile (
1235         "li         %[tmp0],    0x07                                    \n\t"
1236         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1237         "li         %[tmp0],    0x44                                    \n\t"
1238         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1239 
1240         MMI_LDC1(%[ftmp1], %[src], 0x00)
1241         MMI_LDC1(%[ftmp2], %[src], 0x10)
1242         MMI_LDC1(%[ftmp3], %[src], 0x20)
1243         MMI_LDC1(%[ftmp4], %[src], 0x30)
1244         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1245         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1246         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1247         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1248 
1249         /* ftmp11: dst03,dst02,dst01,dst00 */
1250         "li         %[tmp0],    0x00160011                              \n\t"
1251         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1252         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1253         "li         %[tmp0],    0x000a0011                              \n\t"
1254         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1255         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1256         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1257         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1258         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1259         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1260         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1261         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1262         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1263         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1264         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1265         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1266         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1267         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1268         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1269 
1270         /* ftmp12: dst13,dst12,dst11,dst10 */
1271         "li         %[tmp0],    0x000a0011                              \n\t"
1272         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1273         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1274         "li         %[tmp0],    0xffeaffef                              \n\t"
1275         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1276         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1277         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1278         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1279         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1280         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1281         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1282         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1283         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1284         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1285         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1286         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1287         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1288         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1289         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1290 
1291         /* ftmp13: dst23,dst22,dst21,dst20 */
1292         "li         %[tmp0],    0xfff60011                              \n\t"
1293         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1294         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1295         "li         %[tmp0],    0x0016ffef                              \n\t"
1296         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1297         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1298         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1299         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1300         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1301         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1302         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1303         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1304         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1305         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1306         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1307         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1308         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1309         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1310         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1311 
1312         /* ftmp14: dst33,dst32,dst31,dst30 */
1313         "li         %[tmp0],    0xffea0011                              \n\t"
1314         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1315         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1316         "li         %[tmp0],    0xfff60011                              \n\t"
1317         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1318         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1319         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1320         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1321         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1322         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1323         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1324         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1325         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1326         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1327         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1328         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1329         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1330         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1331         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1332 
1333         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1334         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1335         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1336         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1337         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1338         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1339         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1340         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1341         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1342         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1343         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1344         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1345         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1346         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1347         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1348         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1349         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1350         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1351         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1352         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1353 
1354         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1355         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1356         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1357         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1358         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1359         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1360         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1361 
1362         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1363           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1364           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1365           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1366           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1367           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1368           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1369           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1370           [tmp0]"=&r"(tmp[0])
1371         : [ff_pw_64]"f"(ff_pw_32_64.f),
1372           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1373         :"memory"
1374     );
1375 }
1376 
/**
 * Overlap-smooth four horizontally adjacent pixels on each of 8 rows.
 *
 * Every row reads src[-2] .. src[1]. The two outer pixels are moved by d1
 * and the two inner pixels by d2 (the inner ones are clipped to 0..255).
 * The rounding term alternates between 1 and 0 from one row to the next.
 *
 * @param src    pixel at offset 0 of the first row
 * @param stride distance in bytes between consecutive rows
 */
void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride)
{
    int row;
    int rnd = 1;

    for (row = 0; row < 8; row++, src += stride, rnd = !rnd) {
        const int a     = src[-2];
        const int b     = src[-1];
        const int c     = src[0];
        const int d     = src[1];
        const int outer = (a - d + 3 + rnd) >> 3;
        const int inner = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - outer;
        src[-1] = av_clip_uint8(b - inner);
        src[0]  = av_clip_uint8(c + inner);
        src[1]  = d + outer;
    }
}
1400 
/**
 * Overlap-smooth the boundary between two blocks of signed 16-bit samples.
 *
 * On each of 8 rows the last two samples of the left row (left[6], left[7])
 * and the first two samples of the right row (right[0], right[1]) are
 * rewritten in place.
 *
 * @param left         first row of the left block
 * @param right        first row of the right block
 * @param left_stride  distance, in int16_t elements, between rows of left
 * @param right_stride distance, in int16_t elements, between rows of right
 * @param flags        bit 1 (value 2) selects the initial rounders: 3/4 when
 *                     set, 4/3 otherwise; bit 0 (value 1) makes the two
 *                     rounders swap after every row
 */
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        /* Scale with "* 8" rather than "<< 3": the samples are signed and
         * left-shifting a negative int is undefined behaviour. */
        left[6]  = (a * 8 - d1 + rnd1) >> 3;
        left[7]  = (b * 8 - d2 + rnd2) >> 3;
        right[0] = (c * 8 + d2 + rnd1) >> 3;
        right[1] = (d * 8 + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}
1429 
/**
 * Overlap-smooth four vertically adjacent pixels in each of 8 columns.
 *
 * Every column reads src[-2 * stride] .. src[stride]. The two outer pixels
 * are moved by d1 and the two inner pixels by d2 (the inner ones are clipped
 * to 0..255). The rounding term alternates between 1 and 0 from one column
 * to the next.
 *
 * @param src    pixel at row offset 0 of the first column
 * @param stride distance in bytes between consecutive rows
 */
void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride)
{
    int col;
    int rnd = 1;

    for (col = 0; col < 8; col++, src++, rnd = !rnd) {
        const int a     = src[-2 * stride];
        const int b     = src[-stride];
        const int c     = src[0];
        const int d     = src[stride];
        const int outer = (a - d + 3 + rnd) >> 3;
        const int inner = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - outer;
        src[-stride]     = av_clip_uint8(b - inner);
        src[0]           = av_clip_uint8(c + inner);
        src[stride]      = d + outer;
    }
}
1453 
/**
 * Overlap-smooth the boundary between two vertically stacked blocks of
 * signed 16-bit samples.
 *
 * For each of 8 columns, top[48], top[56], bottom[0] and bottom[8] (taken
 * relative to the current column) are rewritten in place. The two rounders
 * start at 4 and 3 and swap after every column.
 *
 * @param top    upper block; elements 48..63 are read and written
 * @param bottom lower block; elements 0..15 are read and written
 */
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        /* Scale with "* 8" rather than "<< 3": the samples are signed and
         * left-shifting a negative int is undefined behaviour. */
        top[48]   = (a * 8 - d1 + rnd1) >> 3;
        top[56]   = (b * 8 - d2 + rnd2) >> 3;
        bottom[0] = (c * 8 + d2 + rnd1) >> 3;
        bottom[8] = (d * 8 + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}
1479 
1480 /**
1481  * VC-1 in-loop deblocking filter for one line
1482  * @param src source block type
1483  * @param stride block stride
1484  * @param pq block quantizer
1485  * @return whether other 3 pairs should be filtered or not
1486  * @see 8.6
1487  */
vc1_filter_line(uint8_t * src,int stride,int pq)1488 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1489 {
1490     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1491               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1492     int a0_sign = a0 >> 31;        /* Store sign */
1493 
1494     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1495     if (a0 < pq) {
1496         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1497                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1498         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1499                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1500         if (a1 < a0 || a2 < a0) {
1501             int clip      = src[-1 * stride] - src[0 * stride];
1502             int clip_sign = clip >> 31;
1503 
1504             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1505             if (clip) {
1506                 int a3     = FFMIN(a1, a2);
1507                 int d      = 5 * (a3 - a0);
1508                 int d_sign = (d >> 31);
1509 
1510                 d       = ((d ^ d_sign) - d_sign) >> 3;
1511                 d_sign ^= a0_sign;
1512 
1513                 if (d_sign ^ clip_sign)
1514                     d = 0;
1515                 else {
1516                     d = FFMIN(d, clip);
1517                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1518                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1519                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1520                 }
1521                 return 1;
1522             }
1523         }
1524     }
1525     return 0;
1526 }
1527 
/**
 * VC-1 in-loop deblocking filter along an edge.
 *
 * The edge is handled in groups of four lines. In each group the third line
 * is filtered first; the remaining three are filtered only if that call
 * returned non-zero.
 *
 * @param src    first sample after the edge on the first line
 * @param step   distance between two neighbouring lines
 * @param stride distance between consecutive samples within a line
 * @param len    number of lines to process (the callers in this file pass
 *               4, 8 or 16)
 * @param pq     block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
                                   int len, int pq)
{
    int done;

    for (done = 0; done < len; done += 4, src += step * 4) {
        if (!vc1_filter_line(src + 2 * step, stride, pq))
            continue;

        vc1_filter_line(src + 0 * step, stride, pq);
        vc1_filter_line(src + 1 * step, stride, pq);
        vc1_filter_line(src + 3 * step, stride, pq);
    }
}
1553 
/**
 * Deblock 4 lines; neighbouring lines are 1 byte apart and the samples of
 * each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 4;

    vc1_loop_filter(src, 1, stride, edge_len, pq);
}
1558 
/**
 * Deblock 4 lines; neighbouring lines are stride bytes apart and the samples
 * of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 4;

    vc1_loop_filter(src, stride, 1, edge_len, pq);
}
1563 
/**
 * Deblock 8 lines; neighbouring lines are 1 byte apart and the samples of
 * each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 8;

    vc1_loop_filter(src, 1, stride, edge_len, pq);
}
1568 
/**
 * Deblock 8 lines; neighbouring lines are stride bytes apart and the samples
 * of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 8;

    vc1_loop_filter(src, stride, 1, edge_len, pq);
}
1573 
/**
 * Deblock 16 lines; neighbouring lines are 1 byte apart and the samples of
 * each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 16;

    vc1_loop_filter(src, 1, stride, edge_len, pq);
}
1578 
/**
 * Deblock 16 lines; neighbouring lines are stride bytes apart and the
 * samples of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
{
    const int edge_len = 16;

    vc1_loop_filter(src, stride, 1, edge_len, pq);
}
1583 
/**
 * mc00 "put" case: forwards to ff_put_pixels8_8_mmi() with a height of 8.
 * The rnd argument is not used.
 */
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int height = 8;

    ff_put_pixels8_8_mmi(dst, src, stride, height);
}
/**
 * mc00 "put" case, 16-wide: forwards to ff_put_pixels16_8_mmi() with a
 * height of 16. The rnd argument is not used.
 */
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int height = 16;

    ff_put_pixels16_8_mmi(dst, src, stride, height);
}
/**
 * mc00 "avg" case: forwards to ff_avg_pixels8_8_mmi() with a height of 8.
 * The rnd argument is not used.
 */
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int height = 8;

    ff_avg_pixels8_8_mmi(dst, src, stride, height);
}
/**
 * mc00 "avg" case, 16-wide: forwards to ff_avg_pixels16_8_mmi() with a
 * height of 16. The rnd argument is not used.
 */
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int height = 16;

    ff_avg_pixels16_8_mmi(dst, src, stride, height);
}
1604 
/*
 * Output operators plugged into the TRANSFER_* macros below.
 * OP_PUT expands to nothing, so the computed value is stored as is.
 * OP_AVG loads 64 bits from S into $f16 and replaces D with the per-byte
 * average (pavgb) of D and that value; it clobbers $f16.
 */
#define OP_PUT(S, D)
#define OP_AVG(S, D)                                                        \
    "ldc1       $f16,   "#S"                        \n\t"                   \
    "pavgb      "#D",   "#D",   $f16                \n\t"

/**
 * Add the rounder held in $f14 to $f6 and $f8, then arithmetic-shift both
 * right by SHIFT. Packing is done separately (see TRANSFER_DO_PACK).
 */
#define NORMALIZE_MMI(SHIFT)                                                \
    "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
    "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
    "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
    "psrah      $f8,    $f8,    "SHIFT"             \n\t"

/**
 * Pack the halfwords of $f6 and $f8 into bytes (unsigned saturation) in $f6,
 * apply OP with the 8 bytes at %[dst] as source, and store $f6 to %[dst].
 */
#define TRANSFER_DO_PACK(OP)                                                \
    "packushb   $f6,    $f6,    $f8                 \n\t"                   \
    OP((%[dst]), $f6)                                                       \
    "sdc1       $f6,    0x00(%[dst])                \n\t"

/**
 * Apply OP to $f6 and $f8 without packing, using the data at %[dst] and
 * %[dst] + 8 as sources, then store both registers to those two locations.
 */
#define TRANSFER_DONT_PACK(OP)                                              \
     OP(0(%[dst]), $f6)                                                     \
     OP(8(%[dst]), $f8)                                                     \
     "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
     "sdc1      $f8,    0x08(%[dst])                \n\t"

/**
 * DO_UNPACK interleaves the low bytes of reg with those of $f0; the macro
 * itself does not clear $f0, so the caller has to provide it.
 * DONT_UNPACK expands to nothing.
 * @see MSPEL_FILTER13_CORE for use as UNPACK macro
 */
#define DO_UNPACK(reg)                                                      \
    "punpcklbh  "reg",  "reg",  $f0                 \n\t"
#define DONT_UNPACK(reg)

/**
 * Load a 32-bit word from ROUND and replicate its low halfword into all four
 * halfword lanes of $f14. The rounder itself (32-r or 8-r according to the
 * original note) is not computed here; it is expected to be in ROUND already.
 */
#define LOAD_ROUNDER_MMI(ROUND)                                             \
    "lwc1       $f14,   "ROUND"                     \n\t"                   \
    "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
    "punpcklwd  $f14,   $f14,   $f14                \n\t"


/*
 * One output line of the shift2 filter:
 *   R1 = ((R1 + R2) * $f6 - R0 - R3 + $f14) >> %[shift]
 * R0 is loaded from %[src] + %[stride1] and R3 from %[src] + %[stride]; both
 * are widened against $f0. R1 is stored at %[dst] + OFF and %[src] is then
 * advanced by %[stride]. Uses GPR $9 as address scratch without declaring
 * it, so the enclosing asm statement has to account for that.
 * NOTE(review): MMI_ULWC1 is defined elsewhere; presumably an unaligned
 * 32-bit load - confirm against mmiutils.h.
 */
#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
    "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
    PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
    MMI_ULWC1(R0, $9, 0x00)                                                 \
    "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
    "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
    PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
    MMI_ULWC1(R3, $9, 0x00)                                                 \
    "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
    "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
    "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
    "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
    "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
    MMI_SDC1(R1, %[dst], OFF)                                               \
    PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1655 
1656 /** Sacrificing $f12 makes it possible to pipeline loads from src */
vc1_put_ver_16b_shift2_mmi(int16_t * dst,const uint8_t * src,mips_reg stride,int rnd,int64_t shift)1657 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1658                                        const uint8_t *src, mips_reg stride,
1659                                        int rnd, int64_t shift)
1660 {
1661     union mmi_intfloat64 shift_u;
1662     DECLARE_VAR_LOW32;
1663     DECLARE_VAR_ADDRT;
1664     shift_u.i = shift;
1665 
1666     __asm__ volatile(
1667         "pxor       $f0,    $f0,    $f0             \n\t"
1668         "li         $8,     0x03                    \n\t"
1669         LOAD_ROUNDER_MMI("%[rnd]")
1670         "1:                                         \n\t"
1671         MMI_ULWC1($f4, %[src], 0x00)
1672         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1673         MMI_ULWC1($f6, %[src], 0x00)
1674         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1675         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1676         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1677         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1678         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1679         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1680         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1681         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1682         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1683         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1684         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1685         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1686         "addiu      $8,     $8,    -0x01            \n\t"
1687         "bnez       $8,     1b                      \n\t"
1688         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1689           [src]"+r"(src),               [dst]"+r"(dst)
1690         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1691           [shift]"f"(shift_u.f),        [rnd]"m"(rnd),
1692           [stride2]"r"(9*stride-4)
1693         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1694           "$f14", "$f16", "memory"
1695     );
1696 }
1697 
1698 /**
1699  * Data is already unpacked, so some operations can directly be made from
1700  * memory.
1701  */
1702 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1703 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1704                                              const int16_t *src, int rnd)   \
1705 {                                                                           \
1706     int h = 8;                                                              \
1707     DECLARE_VAR_ALL64;                                                      \
1708     DECLARE_VAR_ADDRT;                                                      \
1709                                                                             \
1710     src -= 1;                                                               \
1711     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1712                                                                             \
1713     __asm__ volatile(                                                       \
1714         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1715         "1:                                         \n\t"                   \
1716         MMI_ULDC1($f2, %[src], 0x00)                                        \
1717         MMI_ULDC1($f4, %[src], 0x08)                                        \
1718         MMI_ULDC1($f6, %[src], 0x02)                                        \
1719         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1720         MMI_ULDC1($f0, %[src], 0x06)                                        \
1721         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1722         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1723         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1724         MMI_ULDC1($f0, %[src], 0x04)                                        \
1725         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1726         MMI_ULDC1($f0, %[src], 0x0b)                                        \
1727         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1728         "pmullh     $f6,    $f6,    %[ff_pw_9]      \n\t"                   \
1729         "pmullh     $f8,    $f8,    %[ff_pw_9]      \n\t"                   \
1730         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1731         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1732         "li         $8,     0x07                    \n\t"                   \
1733         "mtc1       $8,     $f16                    \n\t"                   \
1734         NORMALIZE_MMI("$f16")                                               \
1735         /* Remove bias */                                                   \
1736         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1737         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1738         TRANSFER_DO_PACK(OP)                                                \
1739         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1740         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1741         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1742         "bnez       %[h],   1b                      \n\t"                   \
1743         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1744           [h]"+r"(h),                                                       \
1745           [src]"+r"(src),               [dst]"+r"(dst)                      \
1746         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1747           [ff_pw_9]"f"(ff_pw_9.f),      [ff_pw_128]"f"(ff_pw_128.f)         \
1748         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14",                  \
1749           "$f16", "memory"                                                  \
1750     );                                                                      \
1751 }
1752 
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755 
1756 /**
1757  * Purely vertical or horizontal 1/2 shift interpolation.
1758  * Sacrify $f12 for *9 factor.
1759  */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1762                                      mips_reg stride, int rnd,              \
1763                                      mips_reg offset)                       \
1764 {                                                                           \
1765     DECLARE_VAR_LOW32;                                                      \
1766     DECLARE_VAR_ADDRT;                                                      \
1767                                                                             \
1768     rnd = 8 - rnd;                                                          \
1769                                                                             \
1770     __asm__ volatile(                                                       \
1771         "pxor       $f0,    $f0,    $f0             \n\t"                   \
1772         "li         $10,    0x08                    \n\t"                   \
1773         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1774         "1:                                         \n\t"                   \
1775         MMI_ULWC1($f6, %[src], 0x00)                                        \
1776         MMI_ULWC1($f8, %[src], 0x04)                                        \
1777         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1778         MMI_ULWC1($f2, $9, 0x00)                                            \
1779         MMI_ULWC1($f4, $9, 0x04)                                            \
1780         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1781         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1782         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1783         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1784         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1785         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1786         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1787         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1788         MMI_ULWC1($f2, $9, 0x00)                                            \
1789         MMI_ULWC1($f4, $9, 0x04)                                            \
1790         "pmullh     $f6,    $f6,    %[ff_pw_9]      \n\t" /* 0,9,9,0*/      \
1791         "pmullh     $f8,    $f8,    %[ff_pw_9]      \n\t" /* 0,9,9,0*/      \
1792         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1793         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1794         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1795         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1796         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1797         MMI_ULWC1($f2, $9, 0x00)                                            \
1798         MMI_ULWC1($f4, $9, 0x04)                                            \
1799         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1800         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1801         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1802         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1803         "li         $8,     0x04                    \n\t"                   \
1804         "mtc1       $8,     $f16                    \n\t"                   \
1805         NORMALIZE_MMI("$f16")                                               \
1806         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1807         OP((%[dst]), $f6)                                                   \
1808         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1809         "addiu      $10,    $10,   -0x01            \n\t"                   \
1810         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1811         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1812         "bnez       $10,    1b                      \n\t"                   \
1813         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1814           [src]"+r"(src),               [dst]"+r"(dst)                      \
1815         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1816           [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1817           [stride1]"r"(stride-offset),                                      \
1818           [ff_pw_9]"f"(ff_pw_9.f)                                           \
1819         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1820           "$f14", "$f16", "memory"                                          \
1821     );                                                                      \
1822 }
1823 
1824 VC1_SHIFT2(OP_PUT, put_)
1825 VC1_SHIFT2(OP_AVG, avg_)
1826 
/* Result: $f6 (first 4 samples) and $f8 (next 4 samples) hold
 * -3*tap(A1) + 18*tap(A2) + 53*tap(A3) - 4*tap(A4) as halfwords, with no
 * rounding or shift applied. Uses $f2/$f4 and $f16 as temporaries and $8/$9
 * as integer scratch. Expects the asm operands %[src], %[ff_pw_3],
 * %[ff_pw_18] and %[ff_pw_53]; when UNPACK is DO_UNPACK, $f0 must be zero. */
1827 /**
1828  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1829  *
1830  * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
1831  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1832  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1833  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1834  * @param A2      Stride address of 2nd tap
1835  * @param A3      Stride address of 3rd tap
1836  * @param A4      Stride address of 4th tap
1837  */
1838 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1839     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1840     LOAD($f2, $9, M*0)                                                      \
1841     LOAD($f4, $9, M*4)                                                      \
1842     UNPACK("$f2")                                                           \
1843     UNPACK("$f4")                                                           \
1844     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t" /* *3 */              \
1845     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t" /* *3 */              \
1846     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1847     LOAD($f6, $9, M*0)                                                      \
1848     LOAD($f8, $9, M*4)                                                      \
1849     UNPACK("$f6")                                                           \
1850     UNPACK("$f8")                                                           \
1851     "pmullh     $f6,    $f6,    %[ff_pw_18]     \n\t" /* *18 */             \
1852     "pmullh     $f8,    $f8,    %[ff_pw_18]     \n\t" /* *18 */             \
1853     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1854     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1855     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1856     LOAD($f2, $9, M*0)                                                      \
1857     LOAD($f4, $9, M*4)                                                      \
1858     UNPACK("$f2")                                                           \
1859     UNPACK("$f4")                                                           \
1860     "li         $8,     0x02                    \n\t"                       \
1861     "mtc1       $8,     $f16                    \n\t"                       \
1862     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1863     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1864     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1865     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1866     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1867     LOAD($f2, $9, M*0)                                                      \
1868     LOAD($f4, $9, M*4)                                                      \
1869     UNPACK("$f2")                                                           \
1870     UNPACK("$f4")                                                           \
1871     "pmullh     $f2,    $f2,    %[ff_pw_53]     \n\t" /* *53 */             \
1872     "pmullh     $f4,    $f4,    %[ff_pw_53]     \n\t" /* *53 */             \
1873     "paddh      $f6,    $f6,    $f2             \n\t" /* -4,53,18,-3 */     \
1874     "paddh      $f8,    $f8,    $f4             \n\t" /* -4,53,18,-3 */
1875 
/**
 * Macro to build the vertical 16bits version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed as A1 to A4 must be one of
 * $0, %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) or
 * %[stride_x3] (3*src_stride).
 *
 * The generated function filters 8 rows. Per row it writes 12 int16_t:
 * columns 0-7 through MSPEL_FILTER13_CORE + NORMALIZE_MMI, columns 8-11
 * through the explicit tail below. Both parts use the same rounder ($f14)
 * and the same %[shift], so the whole row is scaled consistently.
 *
 * @param  NAME   Either shift1 or shift3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 *
 * Generated function parameters:
 *   dst         destination, 8 rows of 12 int16_t (0x18 bytes per row)
 *   src         source pixels; src - src_stride is the first row read
 *   src_stride  source row stride in bytes
 *   rnd         rounder added before the shift
 *   shift       right-shift count applied to every sum
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
static void                                                                 \
vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
                                 mips_reg src_stride,                       \
                                 int rnd, int64_t shift)                    \
{                                                                           \
    int h = 8;                                                              \
    union mmi_intfloat64 shift_u;                                           \
    DECLARE_VAR_LOW32;                                                      \
    DECLARE_VAR_ADDRT;                                                      \
    shift_u.i = shift;                                                      \
                                                                            \
    src -= src_stride;                                                      \
                                                                            \
    __asm__ volatile(                                                       \
        "pxor       $f0,    $f0,    $f0             \n\t"                   \
        LOAD_ROUNDER_MMI("%[rnd]")                                          \
        ".p2align 3                                 \n\t"                   \
        "1:                                         \n\t"                   \
        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
        NORMALIZE_MMI("%[shift]")                                           \
        TRANSFER_DONT_PACK(OP_PUT)                                          \
        /* Last 3 (in fact 4) bytes on the line */                          \
        PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
        MMI_ULWC1($f2, $9, 0x08)                                            \
        DO_UNPACK("$f2")                                                    \
        "mov.d      $f6,    $f2                     \n\t"                   \
        "paddh      $f2,    $f2,    $f2             \n\t"                   \
        "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
        PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
        MMI_ULWC1($f6, $9, 0x08)                                            \
        DO_UNPACK("$f6")                                                    \
        "pmullh     $f6,    $f6,    %[ff_pw_18]     \n\t" /* *18 */         \
        "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
        PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
        MMI_ULWC1($f2, $9, 0x08)                                            \
        DO_UNPACK("$f2")                                                    \
        "pmullh     $f2,    $f2,    %[ff_pw_53]     \n\t" /* *53 */         \
        "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
        PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
        MMI_ULWC1($f2, $9, 0x08)                                            \
        DO_UNPACK("$f2")                                                    \
        "li         $8,     0x02                    \n\t"                   \
        "mtc1       $8,     $f16                    \n\t"                   \
        "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
        "psubh      $f6,    $f6,    $f2             \n\t" /* -4,53,18,-3 */ \
        "paddh      $f6,    $f6,    $f14            \n\t" /* + rounder */   \
        /* same shift as columns 0-7 */                                     \
        "psrah      $f6,    $f6,    %[shift]        \n\t"                   \
        "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
        PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
        PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
        "bnez       %[h],   1b                      \n\t"                   \
        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
          [h]"+r"(h),                                                       \
          [src]"+r"(src),               [dst]"+r"(dst)                      \
        : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
          [stride_x3]"r"(3*src_stride),                                     \
          [rnd]"m"(rnd),                [shift]"f"(shift_u.f),              \
          [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
          [ff_pw_3]"f"(ff_pw_3.f)                                           \
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
          "$f14", "$f16", "memory"                                          \
    );                                                                      \
}
1951 
/**
 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
 * Here, offset=16bits, so parameters passed as A1 to A4 are plain byte
 * offsets (0, 2, 4 or 6) relative to src after the src -= 1 adjustment.
 *
 * The generated function filters 8 rows of 8 pixels from 16-bit
 * intermediates laid out 0x18 bytes (12 int16_t) per row.
 *
 * The rounder is lowered by 64*256 before the >>7, which amounts to -128
 * per sample; ff_pw_128 is added back afterwards to cancel it
 * (presumably to keep the intermediate within signed 16 bits).
 *
 * @param  NAME    Either shift1 or shift3
 * @param  OP      OP_PUT or OP_AVG, applied when storing to dst
 * @param  OPNAME  prefix of the generated function name (put_ / avg_)
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
static void                                                                 \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
                                       const int16_t *src, int rnd)         \
{                                                                           \
    int h = 8;                                                              \
    DECLARE_VAR_ALL64;                                                      \
    DECLARE_VAR_ADDRT;                                                      \
                                                                            \
    src -= 1;                                                               \
    rnd -= (-4+53+18-3)*256; /* Add -256 bias (sum of taps = 64) */         \
                                                                            \
    __asm__ volatile(                                                       \
        "pxor       $f0,    $f0,    $f0             \n\t"                   \
        LOAD_ROUNDER_MMI("%[rnd]")                                          \
        ".p2align 3                                 \n\t"                   \
        "1:                                         \n\t"                   \
        MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
        "li         $8,     0x07                    \n\t"                   \
        "mtc1       $8,     $f16                    \n\t"                   \
        NORMALIZE_MMI("$f16")                                               \
        /* Remove bias */                                                   \
        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
        TRANSFER_DO_PACK(OP)                                                \
        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
        /* immediate add: next source row, 0x18 bytes = 12 int16_t */       \
        PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
        "bnez       %[h],   1b                      \n\t"                   \
        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
          [h]"+r"(h),                                                       \
          [src]"+r"(src),               [dst]"+r"(dst)                      \
        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
          [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
          [ff_pw_3]"f"(ff_pw_3.f),      [ff_pw_128]"f"(ff_pw_128.f)         \
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
          "$f14", "$f16", "memory"                                          \
    );                                                                      \
}
1998 
/* The generated function filters 8 rows of 8 pixels: src is first moved back
 * by one offset, the rounder added is 32 - rnd, the sum is shifted right by
 * 6, packed to bytes and stored through OP. src and dst both advance by
 * stride per row. */
1999 /**
2000  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2001  * Here, offset=src_stride. Parameters passed A1 to A4 must use
2002  * $0, %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2003  *
2004  * @param  NAME   Either shift1 or shift3
2005  * @see MSPEL_FILTER13_CORE for information on A1->A4
2006  */
2007 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2008 static void                                                                 \
2009 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2010                               mips_reg stride, int rnd, mips_reg offset)    \
2011 {                                                                           \
2012     int h = 8;                                                              \
2013     DECLARE_VAR_LOW32;                                                      \
2014     DECLARE_VAR_ADDRT;                                                      \
2015                                                                             \
2016     src -= offset;                                                          \
2017     rnd = 32-rnd;                                                           \
2018                                                                             \
2019     __asm__ volatile (                                                      \
2020         "pxor       $f0,    $f0,    $f0             \n\t"                   \
2021         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2022         ".p2align 3                                 \n\t"                   \
2023         "1:                                         \n\t"                   \
2024         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2025         "li         $8,     0x06                    \n\t"                   \
2026         "mtc1       $8,     $f16                    \n\t"                   \
2027         NORMALIZE_MMI("$f16")                                               \
2028         TRANSFER_DO_PACK(OP)                                                \
2029         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2030         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2031         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2032         "bnez       %[h],   1b                      \n\t"                   \
2033         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2034           [h]"+r"(h),                                                       \
2035           [src]"+r"(src),               [dst]"+r"(dst)                      \
2036         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2037           [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2038           [rnd]"m"(rnd),                                                    \
2039           [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
2040           [ff_pw_3]"f"(ff_pw_3.f)                                           \
2041         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
2042           "$f14", "$f16", "memory"                                          \
2043     );                                                                      \
2044 }
2045 
2046 
/* Tap placement: MSPEL_FILTER13_CORE weights A1..A4 by -3, 18, 53, -4.
 * shift1 passes the addresses in descending order (3, 2, 1, 0 steps) and
 * shift3 in ascending order (0, 1, 2, 3 steps), so the two are mirror
 * images. For the HOR_16B variants the steps are byte offsets into int16_t
 * data (2 bytes per sample). */
2047 /** 1/4 shift bicubic interpolation */
2048 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2049 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2050 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2051 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2052 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2053 
2054 /** 3/4 shift bicubic interpolation */
2055 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2056 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2057 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2058 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2059 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2060 
/* Function-pointer types for the three filter families above; they are used
 * for the dispatch tables in VC1_MSPEL_MC. */
/* Vertical pass: 8-bit source to 16-bit intermediates. */
2061 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2062              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2063               int64_t shift);
/* Horizontal pass: 16-bit intermediates to 8-bit destination. */
2064 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2065              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
/* Single-direction filter working directly on 8-bit data; offset is the
 * byte distance between taps. */
2066 typedef void (*vc1_mspel_mc_filter_8bits)
2067              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2068               mips_reg offset);
2069 
2070 /**
2071  * Interpolate fractional pel values by applying proper vertical then
2072  * horizontal filter.
2073  *
2074  * @param  dst     Destination buffer for interpolated pels.
2075  * @param  src     Source buffer.
2076  * @param  stride  Stride for both src and dst buffers.
2077  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
2078  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
2079  * @param  rnd     Rounding bias.
2080  */
2081 #define VC1_MSPEL_MC(OP)                                                    \
2082 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2083                                int hmode, int vmode, int rnd)               \
2084 {                                                                           \
2085     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2086          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
2087                  vc1_put_ver_16b_shift2_mmi,                                \
2088                  vc1_put_ver_16b_shift3_mmi };                              \
2089     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2090          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
2091                  OP ## vc1_hor_16b_shift2_mmi,                              \
2092                  OP ## vc1_hor_16b_shift3_mmi };                            \
2093     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
2094          { NULL, OP ## vc1_shift1_mmi,                                      \
2095                  OP ## vc1_shift2_mmi,                                      \
2096                  OP ## vc1_shift3_mmi };                                    \
2097                                                                             \
2098     if (vmode) { /* Vertical filter to apply */                             \
2099         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
2100             static const int shift_value[] = { 0, 5, 1, 5 };                \
2101             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
2102             int    r;                                                       \
2103             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
2104                                                                             \
2105             r = (1<<(shift-1)) + rnd-1;                                     \
2106             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
2107                                                                             \
2108             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
2109             return;                                                         \
2110         }                                                                   \
2111         else { /* No horizontal filter, output 8 lines to dst */            \
2112             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
2113             return;                                                         \
2114         }                                                                   \
2115     }                                                                       \
2116                                                                             \
2117     /* Horizontal mode with no vertical mode */                             \
2118     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
2119 }                                                                           \
2120 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
2121                                   int stride, int hmode, int vmode, int rnd)\
2122 {                                                                           \
2123     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2124     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2125     dst += 8*stride; src += 8*stride;                                       \
2126     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2127     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2128 }
2129 
2130 VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)2131 VC1_MSPEL_MC(avg_)
2132 
/**
 * Emit one exported wrapper called name that forwards to the static routine
 * impl with the filter modes a (passed as hmode) and b (passed as vmode)
 * fixed at compile time.
 */
#define DECLARE_MSPEL_WRAPPER(name, impl, a, b)                             \
void name(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)      \
{                                                                           \
    impl(dst, src, stride, a, b, rnd);                                      \
}

/**
 * Declare the put/avg wrappers, in both the plain and the _16 flavour, for
 * one (a, b) filter-mode pair of the bicubic interpolation functions.
 */
#define DECLARE_FUNCTION(a, b)                                              \
DECLARE_MSPEL_WRAPPER(ff_put_vc1_mspel_mc ## a ## b ## _mmi,                \
                      put_vc1_mspel_mc, a, b)                               \
DECLARE_MSPEL_WRAPPER(ff_avg_vc1_mspel_mc ## a ## b ## _mmi,                \
                      avg_vc1_mspel_mc, a, b)                               \
DECLARE_MSPEL_WRAPPER(ff_put_vc1_mspel_mc ## a ## b ## _16_mmi,             \
                      put_vc1_mspel_mc_16, a, b)                            \
DECLARE_MSPEL_WRAPPER(ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi,             \
                      avg_vc1_mspel_mc_16, a, b)

/* Every mode pair except (0, 0), which needs no filtering. */
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
2182 
/*
 * Asm fragment: weighted sum of four 8-byte vectors, one output row.
 *
 * In:   ftmp1..ftmp4  source bytes, weighted by A, B, C, D respectively
 *       ftmp0         bytes interleaved in when widening (the users in this
 *                     file clear it with pxor, giving zero-extension)
 *       ftmp9         halfword shift count
 *       A, B, C, D    per-halfword weights; ff_pw_28 is the added bias
 * Out:  ftmp1         eight result bytes packed with unsigned saturation
 * Also overwrites ftmp2..ftmp8.
 */
#define CHROMA_MC_8_MMI                                                     \
        /* Widen: upper halves first, the lower unpack is in place. */      \
        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
                                                                            \
        /* Lower four pixels: weight, sum, bias, shift. */                  \
        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
                                                                            \
        /* Upper four pixels: same sequence on ftmp5..ftmp8. */             \
        "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
        "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
        "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
        "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
        "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
                                                                            \
        /* Narrow both halves back to eight bytes. */                       \
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2215 
2216 
/*
 * Asm fragment: weighted sum of four 4-byte vectors, one output row.
 *
 * In:   ftmp1..ftmp4  source bytes in the low word, weighted by A, B, C, D
 *       ftmp0         bytes interleaved in when widening and packed into the
 *                     upper half of the result (the users in this file clear
 *                     it with pxor)
 *       ftmp5         halfword shift count
 *       A, B, C, D    per-halfword weights; ff_pw_28 is the added bias
 * Out:  ftmp1         four result bytes in the low word
 * Also overwrites ftmp2..ftmp4.
 */
#define CHROMA_MC_4_MMI                                                     \
        /* Widen each tap to halfwords and apply its weight. */             \
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
                                                                            \
        /* Sum the four products, add the bias, shift, narrow to bytes. */  \
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2235 
2236 
2237 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2238                                       uint8_t *src /* align 1 */,
2239                                       ptrdiff_t stride, int h, int x, int y)
2240 {
2241     union mmi_intfloat64 A, B, C, D;
2242     double ftmp[10];
2243     uint32_t tmp[1];
2244     DECLARE_VAR_ALL64;
2245     DECLARE_VAR_ADDRT;
2246     A.i = (8 - x) * (8 - y);
2247     B.i =     (x) * (8 - y);
2248     C.i = (8 - x) *     (y);
2249     D.i =     (x) *     (y);
2250 
2251     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2252 
2253     __asm__ volatile(
2254         "li         %[tmp0],    0x06                                    \n\t"
2255         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2256         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2257         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2258         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2259         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2260         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2261 
2262         "1:                                                             \n\t"
2263         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2264         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2265         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2266         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2267         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2268 
2269         CHROMA_MC_8_MMI
2270 
2271         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2272         "addiu      %[h],       %[h],      -0x01                        \n\t"
2273         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2274         "bnez       %[h],       1b                                      \n\t"
2275         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2276           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2277           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2278           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2279           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2280           RESTRICT_ASM_ALL64
2281           RESTRICT_ASM_ADDRT
2282           [tmp0]"=&r"(tmp[0]),
2283           [src]"+&r"(src),              [dst]"+&r"(dst),
2284           [h]"+&r"(h)
2285         : [stride]"r"((mips_reg)stride),
2286           [A]"f"(A.f),                  [B]"f"(B.f),
2287           [C]"f"(C.f),                  [D]"f"(D.f),
2288           [ff_pw_28]"f"(ff_pw_28.f)
2289         : "memory"
2290     );
2291 }
2292 
ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)2293 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2294                                       uint8_t *src /* align 1 */,
2295                                       ptrdiff_t stride, int h, int x, int y)
2296 {
2297     union mmi_intfloat64 A, B, C, D;
2298     double ftmp[6];
2299     uint32_t tmp[1];
2300     DECLARE_VAR_LOW32;
2301     DECLARE_VAR_ADDRT;
2302     A.i = (8 - x) * (8 - y);
2303     B.i =     (x) * (8 - y);
2304     C.i = (8 - x) *     (y);
2305     D.i =     (x) *     (y);
2306 
2307     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2308 
2309     __asm__ volatile(
2310         "li         %[tmp0],    0x06                                    \n\t"
2311         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2312         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2313         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2314         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2315         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2316         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2317 
2318         "1:                                                             \n\t"
2319         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2320         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2321         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2322         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2323         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2324 
2325         CHROMA_MC_4_MMI
2326 
2327         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2328         "addiu      %[h],       %[h],      -0x01                        \n\t"
2329         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2330         "bnez       %[h],       1b                                      \n\t"
2331         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2332           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2333           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2334           [tmp0]"=&r"(tmp[0]),
2335           RESTRICT_ASM_LOW32
2336           RESTRICT_ASM_ADDRT
2337           [src]"+&r"(src),              [dst]"+&r"(dst),
2338           [h]"+&r"(h)
2339         : [stride]"r"((mips_reg)stride),
2340           [A]"f"(A.f),                  [B]"f"(B.f),
2341           [C]"f"(C.f),                  [D]"f"(D.f),
2342           [ff_pw_28]"f"(ff_pw_28.f)
2343         : "memory"
2344     );
2345 }
2346 
ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)2347 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2348                                       uint8_t *src /* align 1 */,
2349                                       ptrdiff_t stride, int h, int x, int y)
2350 {
2351     union mmi_intfloat64 A, B, C, D;
2352     double ftmp[10];
2353     uint32_t tmp[1];
2354     DECLARE_VAR_ALL64;
2355     DECLARE_VAR_ADDRT;
2356     A.i = (8 - x) * (8 - y);
2357     B.i =     (x) * (8 - y);
2358     C.i = (8 - x) *     (y);
2359     D.i =     (x) *     (y);
2360 
2361     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2362 
2363     __asm__ volatile(
2364         "li         %[tmp0],    0x06                                    \n\t"
2365         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2366         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2367         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2368         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2369         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2370         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2371 
2372         "1:                                                             \n\t"
2373         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2374         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2375         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2376         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2377         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2378 
2379         CHROMA_MC_8_MMI
2380 
2381         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2382         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2383 
2384         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2385         "addiu      %[h],       %[h],      -0x01                        \n\t"
2386         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2387         "bnez       %[h],       1b                                      \n\t"
2388         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2389           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2390           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2391           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2392           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2393           [tmp0]"=&r"(tmp[0]),
2394           RESTRICT_ASM_ALL64
2395           RESTRICT_ASM_ADDRT
2396           [src]"+&r"(src),              [dst]"+&r"(dst),
2397           [h]"+&r"(h)
2398         : [stride]"r"((mips_reg)stride),
2399           [A]"f"(A.f),                 [B]"f"(B.f),
2400           [C]"f"(C.f),                 [D]"f"(D.f),
2401           [ff_pw_28]"f"(ff_pw_28.f)
2402         : "memory"
2403     );
2404 }
2405 
ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)2406 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2407                                       uint8_t *src /* align 1 */,
2408                                       ptrdiff_t stride, int h, int x, int y)
2409 {
2410     union mmi_intfloat64 A, B, C, D;
2411     double ftmp[6];
2412     uint32_t tmp[1];
2413     DECLARE_VAR_LOW32;
2414     DECLARE_VAR_ADDRT;
2415     A.i = (8 - x) * (8 - y);
2416     B.i = (x) * (8 - y);
2417     C.i = (8 - x) * (y);
2418     D.i = (x) * (y);
2419 
2420     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2421 
2422     __asm__ volatile(
2423         "li         %[tmp0],    0x06                                    \n\t"
2424         "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2425         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2426         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2427         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2428         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2429         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2430 
2431         "1:                                                             \n\t"
2432         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2433         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2434         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2435         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2436         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2437 
2438         CHROMA_MC_4_MMI
2439 
2440         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2441         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2442 
2443         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2444         "addiu      %[h],       %[h],      -0x01                        \n\t"
2445         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2446         "bnez       %[h],       1b                                      \n\t"
2447         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2448           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2449           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2450           [tmp0]"=&r"(tmp[0]),
2451           RESTRICT_ASM_LOW32
2452           RESTRICT_ASM_ADDRT
2453           [src]"+&r"(src),              [dst]"+&r"(dst),
2454           [h]"+&r"(h)
2455         : [stride]"r"((mips_reg)stride),
2456           [A]"f"(A.f),                  [B]"f"(B.f),
2457           [C]"f"(C.f),                  [D]"f"(D.f),
2458           [ff_pw_28]"f"(ff_pw_28.f)
2459         : "memory"
2460     );
2461 }
2462