• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vpx_ports/asmdefs_mmi.h"
14 
/*
 * Helper for TRANSPOSE_4H.
 *
 * Interleaves the low (unpack == "punpcklhw") or high (unpack == "punpckhhw")
 * pair of halfwords of %[row_a] and %[row_b] into %[dst]:
 *   1. unpack row_a against the zero register  -> { a, 0, a', 0 }
 *   2. unpack row_b against the zero register  -> { b, 0, b', 0 }
 *   3. pshufh with the 0x93 selector held in %[ftmp10] shifts row_b's
 *      halfwords up by one lane                 -> { 0, b, 0, b' }
 *   4. OR the two together                     -> { a, b, a', b' }
 * Requires %[ftmp0] == 0 and %[ftmp10] == 0x93; uses %[ftmp9] as scratch.
 */
#define TRANSPOSE_4H_INTERLEAVE(unpack, dst, row_a, row_b)                  \
  unpack "     %[" #dst "],    %[" #row_a "],    %[ftmp0]          \n\t"    \
  unpack "     %[ftmp9],    %[" #row_b "],    %[ftmp0]             \n\t"    \
  "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]               \n\t"    \
  "por           %[" #dst "],    %[" #dst "],    %[ftmp9]          \n\t"

/*
 * Transposes the 4x4 matrix of 16-bit values held one row per register in
 * %[ftmp1]..%[ftmp4]; the transposed rows are written back to the same four
 * registers.
 *
 * Operands the enclosing asm statement must provide:
 *   in/out : ftmp1, ftmp2, ftmp3, ftmp4
 *   scratch: ftmp5..ftmp10 (FP registers), tmp0 (general register)
 *   ftmp0 is cleared and is left holding zero on exit.
 */
#define TRANSPOSE_4H                                                        \
  "pxor          %[ftmp0],    %[ftmp0],    %[ftmp0]                \n\t"    \
  MMI_LI(%[tmp0], 0x93)                                                     \
  "mtc1          %[tmp0],     %[ftmp10]                            \n\t"    \
  TRANSPOSE_4H_INTERLEAVE("punpcklhw", ftmp5, ftmp1, ftmp2)                 \
  TRANSPOSE_4H_INTERLEAVE("punpckhhw", ftmp6, ftmp1, ftmp2)                 \
  TRANSPOSE_4H_INTERLEAVE("punpcklhw", ftmp7, ftmp3, ftmp4)                 \
  TRANSPOSE_4H_INTERLEAVE("punpckhhw", ftmp8, ftmp3, ftmp4)                 \
  "punpcklwd     %[ftmp1],    %[ftmp5],    %[ftmp7]                \n\t"    \
  "punpckhwd     %[ftmp2],    %[ftmp5],    %[ftmp7]                \n\t"    \
  "punpcklwd     %[ftmp3],    %[ftmp6],    %[ftmp8]                \n\t"    \
  "punpckhwd     %[ftmp4],    %[ftmp6],    %[ftmp8]                \n\t"
39 
vp8_short_idct4x4llm_mmi(int16_t * input,unsigned char * pred_ptr,int pred_stride,unsigned char * dst_ptr,int dst_stride)40 void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
41                               int pred_stride, unsigned char *dst_ptr,
42                               int dst_stride) {
43   double ftmp[12];
44   uint64_t tmp[1];
45   double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
46 
47   __asm__ volatile (
48     "dli        %[tmp0],    0x0004000400040004                  \n\t"
49     "dmtc1      %[tmp0],    %[ff_ph_04]                         \n\t"
50     "dli        %[tmp0],    0x4e7b4e7b4e7b4e7b                  \n\t"
51     "dmtc1      %[tmp0],    %[ff_ph_4e7b]                       \n\t"
52     "dli        %[tmp0],    0x22a322a322a322a3                  \n\t"
53     "dmtc1      %[tmp0],    %[ff_ph_22a3]                       \n\t"
54     MMI_LI(%[tmp0], 0x02)
55     "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
56     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
57 
58     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
59     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
60     "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
61     "gsldrc1    %[ftmp2],   0x08(%[ip])                         \n\t"
62     "gsldlc1    %[ftmp3],   0x17(%[ip])                         \n\t"
63     "gsldrc1    %[ftmp3],   0x10(%[ip])                         \n\t"
64     "gsldlc1    %[ftmp4],   0x1f(%[ip])                         \n\t"
65     "gsldrc1    %[ftmp4],   0x18(%[ip])                         \n\t"
66 
67     // ip[0...3] + ip[8...11]
68     "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
69     // ip[0...3] - ip[8...11]
70     "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
71     // (ip[12...15] * sinpi8sqrt2) >> 16
72     "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
73     "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
74     // (ip[ 4... 7] * sinpi8sqrt2) >> 16
75     "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
76     "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
77     // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
78     "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
79     "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
80     // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
81     "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
82     "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
83 
84     "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
85     "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
86     "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
87     "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
88     "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
89     "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
90     "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
91     "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
92 
93     TRANSPOSE_4H
94     // a
95     "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
96     // b
97     "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
98     // c
99     "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
100     "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
101     "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
102     "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
103     "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
104     // d
105     "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
106     "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
107     "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
108     "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
109     "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
110 
111     MMI_LI(%[tmp0], 0x03)
112     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
113     // a + d
114     "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
115     "paddh      %[ftmp1],   %[ftmp1],       %[ff_ph_04]         \n\t"
116     "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
117     // b + c
118     "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
119     "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_04]         \n\t"
120     "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
121     // b - c
122     "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
123     "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_04]         \n\t"
124     "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
125     // a - d
126     "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
127     "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_04]         \n\t"
128     "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
129 
130     TRANSPOSE_4H
131 #if _MIPS_SIM == _ABIO32
132     "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
133     "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
134 #else
135     "gslwlc1    %[ftmp5],   0x03(%[pred_ptr])                   \n\t"
136     "gslwrc1    %[ftmp5],   0x00(%[pred_ptr])                   \n\t"
137 #endif
138     "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
139     "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
140     "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
141     "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                    \n\t"
142     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                    \n\t"
143     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
144     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
145 
146 #if _MIPS_SIM == _ABIO32
147     "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
148     "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
149 #else
150     "gslwlc1    %[ftmp6],   0x03(%[pred_ptr])                   \n\t"
151     "gslwrc1    %[ftmp6],   0x00(%[pred_ptr])                   \n\t"
152 #endif
153     "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
154     "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
155     "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
156     "gsswlc1    %[ftmp2],   0x03(%[dst_ptr])                    \n\t"
157     "gsswrc1    %[ftmp2],   0x00(%[dst_ptr])                    \n\t"
158     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
159     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
160 
161 #if _MIPS_SIM == _ABIO32
162     "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
163     "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
164 #else
165     "gslwlc1    %[ftmp7],   0x03(%[pred_ptr])                   \n\t"
166     "gslwrc1    %[ftmp7],   0x00(%[pred_ptr])                   \n\t"
167 #endif
168     "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
169     "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
170     "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
171     "gsswlc1    %[ftmp3],   0x03(%[dst_ptr])                    \n\t"
172     "gsswrc1    %[ftmp3],   0x00(%[dst_ptr])                    \n\t"
173     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
174     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
175 
176 #if _MIPS_SIM == _ABIO32
177     "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
178     "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
179 #else
180     "gslwlc1    %[ftmp8],   0x03(%[pred_ptr])                   \n\t"
181     "gslwrc1    %[ftmp8],   0x00(%[pred_ptr])                   \n\t"
182 #endif
183     "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
184     "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
185     "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
186     "gsswlc1    %[ftmp4],   0x03(%[dst_ptr])                    \n\t"
187     "gsswrc1    %[ftmp4],   0x00(%[dst_ptr])                    \n\t"
188     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
189       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
190       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
191       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
192       [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
193       [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
194       [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
195       [ff_ph_22a3]"=&f"(ff_ph_22a3)
196     : [ip]"r"(input),
197       [pred_stride]"r"((mips_reg)pred_stride),
198       [dst_stride]"r"((mips_reg)dst_stride)
199     : "memory"
200   );
201 }
202 
vp8_dc_only_idct_add_mmi(int16_t input_dc,unsigned char * pred_ptr,int pred_stride,unsigned char * dst_ptr,int dst_stride)203 void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
204                               int pred_stride, unsigned char *dst_ptr,
205                               int dst_stride) {
206   int a0 = ((input_dc + 4) >> 3);
207   double a1, ftmp[5];
208   int low32;
209 
210   __asm__ volatile (
211     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
212     "dmtc1      %[a0],      %[a1]                           \n\t"
213     "pshufh     %[a1],      %[a1],          %[ftmp0]        \n\t"
214     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
215     "mtc1       %[low32],   %[ftmp1]                        \n\t"
216     "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
217     "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
218     "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
219     "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
220     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
221 
222     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
223     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
224     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
225     "mtc1       %[low32],   %[ftmp1]                        \n\t"
226     "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
227     "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
228     "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
229     "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
230     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
231 
232     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
233     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
234     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
235     "mtc1       %[low32],   %[ftmp1]                        \n\t"
236     "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
237     "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
238     "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
239     "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
240     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
241 
242     MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
243     MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
244     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
245     "mtc1       %[low32],   %[ftmp1]                        \n\t"
246     "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
247     "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
248     "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
249     "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
250     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
251     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
252       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
253       [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
254     : [dst_stride]"r"((mips_reg)dst_stride),
255       [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
256     : "memory"
257   );
258 }
259 
vp8_short_inv_walsh4x4_mmi(int16_t * input,int16_t * mb_dqcoeff)260 void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
261   int i;
262   int16_t output[16];
263   double ff_ph_03, ftmp[12];
264   uint64_t tmp[1];
265 
266   __asm__ volatile (
267     "dli        %[tmp0],    0x0003000300030003                  \n\t"
268     "dmtc1      %[tmp0],    %[ff_ph_03]                         \n\t"
269     MMI_LI(%[tmp0], 0x03)
270     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
271     "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
272     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
273     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
274     "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
275     "gsldrc1    %[ftmp2],   0x08(%[ip])                         \n\t"
276     "gsldlc1    %[ftmp3],   0x17(%[ip])                         \n\t"
277     "gsldrc1    %[ftmp3],   0x10(%[ip])                         \n\t"
278     "gsldlc1    %[ftmp4],   0x1f(%[ip])                         \n\t"
279     "gsldrc1    %[ftmp4],   0x18(%[ip])                         \n\t"
280     "paddh      %[ftmp5],   %[ftmp1],       %[ftmp2]            \n\t"
281     "psubh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
282     "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]            \n\t"
283     "psubh      %[ftmp8],   %[ftmp3],       %[ftmp4]            \n\t"
284 
285     "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
286     "psubh      %[ftmp2],   %[ftmp5],       %[ftmp7]            \n\t"
287     "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
288     "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
289 
290     TRANSPOSE_4H
291     // a
292     "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]            \n\t"
293     // d
294     "psubh      %[ftmp6],   %[ftmp1],       %[ftmp4]            \n\t"
295     // b
296     "paddh      %[ftmp7],   %[ftmp2],       %[ftmp3]            \n\t"
297     // c
298     "psubh      %[ftmp8],   %[ftmp2],       %[ftmp3]            \n\t"
299 
300     "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
301     "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
302     "psubh      %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
303     "psubh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
304 
305     "paddh      %[ftmp1],   %[ftmp1],       %[ff_ph_03]         \n\t"
306     "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
307     "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_03]         \n\t"
308     "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
309     "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_03]         \n\t"
310     "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
311     "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_03]         \n\t"
312     "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
313 
314     TRANSPOSE_4H
315     "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
316     "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
317     "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
318     "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
319     "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
320     "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
321     "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
322     "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
323     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
324       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
325       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
326       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
327       [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
328     : [ip]"r"(input), [op]"r"(output)
329     : "memory"
330   );
331 
332   for (i = 0; i < 16; i++) {
333     mb_dqcoeff[i * 16] = output[i];
334   }
335 }
336