/*!
 * \copy
 *     Copyright (c) 2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    quant_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    20/07/2018 Created
 *
 *************************************************************************************
 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
WelsQuant4x4_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf)43 void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
44 __asm__ volatile (
45 ".set arch=loongson3a \n\t"
46 "xor $f10, $f10, $f10 \n\t"
47 "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
48 "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
49
50 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
51 "xor $f4, $f4, $f4 \n\t"
52 "xor $f6, $f6, $f6 \n\t"
53 "pcmpgth $f4, $f4, $f0 \n\t"
54 "pcmpgth $f6, $f6, $f2 \n\t"
55 "xor $f0, $f0, $f4 \n\t"
56 "xor $f2, $f2, $f6 \n\t"
57 "psubh $f0, $f0, $f4 \n\t"
58 "psubh $f2, $f2, $f6 \n\t"
59 "paddush $f0, $f0, $f8 \n\t"
60 "paddush $f2, $f2, $f10 \n\t"
61 "pmulhuh $f0, $f0, $f12 \n\t"
62 "pmulhuh $f2, $f2, $f14 \n\t"
63 "xor $f0, $f0, $f4 \n\t"
64 "xor $f2, $f2, $f6 \n\t"
65 "psubh $f0, $f0, $f4 \n\t"
66 "psubh $f2, $f2, $f6 \n\t"
67 "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
68
69 "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
70 "xor $f4, $f4, $f4 \n\t"
71 "xor $f6, $f6, $f6 \n\t"
72 "pcmpgth $f4, $f4, $f0 \n\t"
73 "pcmpgth $f6, $f6, $f2 \n\t"
74 "xor $f0, $f0, $f4 \n\t"
75 "xor $f2, $f2, $f6 \n\t"
76 "psubh $f0, $f0, $f4 \n\t"
77 "psubh $f2, $f2, $f6 \n\t"
78 "paddush $f0, $f0, $f8 \n\t"
79 "paddush $f2, $f2, $f10 \n\t"
80 "pmulhuh $f0, $f0, $f12 \n\t"
81 "pmulhuh $f2, $f2, $f14 \n\t"
82 "xor $f0, $f0, $f4 \n\t"
83 "xor $f2, $f2, $f6 \n\t"
84 "psubh $f0, $f0, $f4 \n\t"
85 "psubh $f2, $f2, $f6 \n\t"
86 "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
87 :
88 : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
89 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
90 );
91 }
92
WelsQuant4x4Dc_mmi(int16_t * pDct,const int16_t ff,int16_t mf)93 void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) {
94 __asm__ volatile (
95 ".set arch=loongson3a \n\t"
96 "xor $f10, $f10, $f10 \n\t"
97 "dmtc1 %[mf], $f12 \n\t"
98 "pshufh $f12, $f12, $f10 \n\t"
99
100 "dmtc1 %[ff], $f8 \n\t"
101 "pshufh $f8, $f8, $f10 \n\t"
102
103 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
104 "xor $f4, $f4, $f4 \n\t"
105 "xor $f6, $f6, $f6 \n\t"
106 "pcmpgth $f4, $f4, $f0 \n\t"
107 "pcmpgth $f6, $f6, $f2 \n\t"
108 "xor $f0, $f0, $f4 \n\t"
109 "xor $f2, $f2, $f6 \n\t"
110 "psubh $f0, $f0, $f4 \n\t"
111 "psubh $f2, $f2, $f6 \n\t"
112 "paddush $f0, $f0, $f8 \n\t"
113 "paddush $f2, $f2, $f8 \n\t"
114 "pmulhuh $f0, $f0, $f12 \n\t"
115 "pmulhuh $f2, $f2, $f12 \n\t"
116 "xor $f0, $f0, $f4 \n\t"
117 "xor $f2, $f2, $f6 \n\t"
118 "psubh $f0, $f0, $f4 \n\t"
119 "psubh $f2, $f2, $f6 \n\t"
120 "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
121
122 "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
123 "xor $f4, $f4, $f4 \n\t"
124 "xor $f6, $f6, $f6 \n\t"
125 "pcmpgth $f4, $f4, $f0 \n\t"
126 "pcmpgth $f6, $f6, $f2 \n\t"
127 "xor $f0, $f0, $f4 \n\t"
128 "xor $f2, $f2, $f6 \n\t"
129 "psubh $f0, $f0, $f4 \n\t"
130 "psubh $f2, $f2, $f6 \n\t"
131 "paddush $f0, $f0, $f8 \n\t"
132 "paddush $f2, $f2, $f8 \n\t"
133 "pmulhuh $f0, $f0, $f12 \n\t"
134 "pmulhuh $f2, $f2, $f12 \n\t"
135 "xor $f0, $f0, $f4 \n\t"
136 "xor $f2, $f2, $f6 \n\t"
137 "psubh $f0, $f0, $f4 \n\t"
138 "psubh $f2, $f2, $f6 \n\t"
139 "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
140 :
141 : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
142 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
143 );
144 }
145
WelsQuantFour4x4_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf)146 void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
147 __asm__ volatile (
148 ".set arch=loongson3a \n\t"
149 "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
150 "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
151
152 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
153 "xor $f4, $f4, $f4 \n\t"
154 "xor $f6, $f6, $f6 \n\t"
155 "pcmpgth $f4, $f4, $f0 \n\t"
156 "pcmpgth $f6, $f6, $f2 \n\t"
157 "xor $f0, $f0, $f4 \n\t"
158 "xor $f2, $f2, $f6 \n\t"
159 "psubh $f0, $f0, $f4 \n\t"
160 "psubh $f2, $f2, $f6 \n\t"
161 "paddush $f0, $f0, $f8 \n\t"
162 "paddush $f2, $f2, $f10 \n\t"
163 "pmulhuh $f0, $f0, $f12 \n\t"
164 "pmulhuh $f2, $f2, $f14 \n\t"
165 "xor $f0, $f0, $f4 \n\t"
166 "xor $f2, $f2, $f6 \n\t"
167 "psubh $f0, $f0, $f4 \n\t"
168 "psubh $f2, $f2, $f6 \n\t"
169 "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
170
171 "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
172 "xor $f4, $f4, $f4 \n\t"
173 "xor $f6, $f6, $f6 \n\t"
174 "pcmpgth $f4, $f4, $f0 \n\t"
175 "pcmpgth $f6, $f6, $f2 \n\t"
176 "xor $f0, $f0, $f4 \n\t"
177 "xor $f2, $f2, $f6 \n\t"
178 "psubh $f0, $f0, $f4 \n\t"
179 "psubh $f2, $f2, $f6 \n\t"
180 "paddush $f0, $f0, $f8 \n\t"
181 "paddush $f2, $f2, $f10 \n\t"
182 "pmulhuh $f0, $f0, $f12 \n\t"
183 "pmulhuh $f2, $f2, $f14 \n\t"
184 "xor $f0, $f0, $f4 \n\t"
185 "xor $f2, $f2, $f6 \n\t"
186 "psubh $f0, $f0, $f4 \n\t"
187 "psubh $f2, $f2, $f6 \n\t"
188 "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
189
190 "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
191 "xor $f4, $f4, $f4 \n\t"
192 "xor $f6, $f6, $f6 \n\t"
193 "pcmpgth $f4, $f4, $f0 \n\t"
194 "pcmpgth $f6, $f6, $f2 \n\t"
195 "xor $f0, $f0, $f4 \n\t"
196 "xor $f2, $f2, $f6 \n\t"
197 "psubh $f0, $f0, $f4 \n\t"
198 "psubh $f2, $f2, $f6 \n\t"
199 "paddush $f0, $f0, $f8 \n\t"
200 "paddush $f2, $f2, $f10 \n\t"
201 "pmulhuh $f0, $f0, $f12 \n\t"
202 "pmulhuh $f2, $f2, $f14 \n\t"
203 "xor $f0, $f0, $f4 \n\t"
204 "xor $f2, $f2, $f6 \n\t"
205 "psubh $f0, $f0, $f4 \n\t"
206 "psubh $f2, $f2, $f6 \n\t"
207 "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
208
209 "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
210 "xor $f4, $f4, $f4 \n\t"
211 "xor $f6, $f6, $f6 \n\t"
212 "pcmpgth $f4, $f4, $f0 \n\t"
213 "pcmpgth $f6, $f6, $f2 \n\t"
214 "xor $f0, $f0, $f4 \n\t"
215 "xor $f2, $f2, $f6 \n\t"
216 "psubh $f0, $f0, $f4 \n\t"
217 "psubh $f2, $f2, $f6 \n\t"
218 "paddush $f0, $f0, $f8 \n\t"
219 "paddush $f2, $f2, $f10 \n\t"
220 "pmulhuh $f0, $f0, $f12 \n\t"
221 "pmulhuh $f2, $f2, $f14 \n\t"
222 "xor $f0, $f0, $f4 \n\t"
223 "xor $f2, $f2, $f6 \n\t"
224 "psubh $f0, $f0, $f4 \n\t"
225 "psubh $f2, $f2, $f6 \n\t"
226 "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
227
228 "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
229 "xor $f4, $f4, $f4 \n\t"
230 "xor $f6, $f6, $f6 \n\t"
231 "pcmpgth $f4, $f4, $f0 \n\t"
232 "pcmpgth $f6, $f6, $f2 \n\t"
233 "xor $f0, $f0, $f4 \n\t"
234 "xor $f2, $f2, $f6 \n\t"
235 "psubh $f0, $f0, $f4 \n\t"
236 "psubh $f2, $f2, $f6 \n\t"
237 "paddush $f0, $f0, $f8 \n\t"
238 "paddush $f2, $f2, $f10 \n\t"
239 "pmulhuh $f0, $f0, $f12 \n\t"
240 "pmulhuh $f2, $f2, $f14 \n\t"
241 "xor $f0, $f0, $f4 \n\t"
242 "xor $f2, $f2, $f6 \n\t"
243 "psubh $f0, $f0, $f4 \n\t"
244 "psubh $f2, $f2, $f6 \n\t"
245 "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
246
247 "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
248 "xor $f4, $f4, $f4 \n\t"
249 "xor $f6, $f6, $f6 \n\t"
250 "pcmpgth $f4, $f4, $f0 \n\t"
251 "pcmpgth $f6, $f6, $f2 \n\t"
252 "xor $f0, $f0, $f4 \n\t"
253 "xor $f2, $f2, $f6 \n\t"
254 "psubh $f0, $f0, $f4 \n\t"
255 "psubh $f2, $f2, $f6 \n\t"
256 "paddush $f0, $f0, $f8 \n\t"
257 "paddush $f2, $f2, $f10 \n\t"
258 "pmulhuh $f0, $f0, $f12 \n\t"
259 "pmulhuh $f2, $f2, $f14 \n\t"
260 "xor $f0, $f0, $f4 \n\t"
261 "xor $f2, $f2, $f6 \n\t"
262 "psubh $f0, $f0, $f4 \n\t"
263 "psubh $f2, $f2, $f6 \n\t"
264 "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
265
266 "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
267 "xor $f4, $f4, $f4 \n\t"
268 "xor $f6, $f6, $f6 \n\t"
269 "pcmpgth $f4, $f4, $f0 \n\t"
270 "pcmpgth $f6, $f6, $f2 \n\t"
271 "xor $f0, $f0, $f4 \n\t"
272 "xor $f2, $f2, $f6 \n\t"
273 "psubh $f0, $f0, $f4 \n\t"
274 "psubh $f2, $f2, $f6 \n\t"
275 "paddush $f0, $f0, $f8 \n\t"
276 "paddush $f2, $f2, $f10 \n\t"
277 "pmulhuh $f0, $f0, $f12 \n\t"
278 "pmulhuh $f2, $f2, $f14 \n\t"
279 "xor $f0, $f0, $f4 \n\t"
280 "xor $f2, $f2, $f6 \n\t"
281 "psubh $f0, $f0, $f4 \n\t"
282 "psubh $f2, $f2, $f6 \n\t"
283 "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
284
285 "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
286 "xor $f4, $f4, $f4 \n\t"
287 "xor $f6, $f6, $f6 \n\t"
288 "pcmpgth $f4, $f4, $f0 \n\t"
289 "pcmpgth $f6, $f6, $f2 \n\t"
290 "xor $f0, $f0, $f4 \n\t"
291 "xor $f2, $f2, $f6 \n\t"
292 "psubh $f0, $f0, $f4 \n\t"
293 "psubh $f2, $f2, $f6 \n\t"
294 "paddush $f0, $f0, $f8 \n\t"
295 "paddush $f2, $f2, $f10 \n\t"
296 "pmulhuh $f0, $f0, $f12 \n\t"
297 "pmulhuh $f2, $f2, $f14 \n\t"
298 "xor $f0, $f0, $f4 \n\t"
299 "xor $f2, $f2, $f6 \n\t"
300 "psubh $f0, $f0, $f4 \n\t"
301 "psubh $f2, $f2, $f6 \n\t"
302 "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
303 :
304 : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
305 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
306 );
307 }
308
WelsQuantFour4x4Max_mmi(int16_t * pDct,const int16_t * ff,const int16_t * mf,int16_t * max)309 void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff,
310 const int16_t *mf, int16_t *max) {
311 BACKUP_REG;
312 __asm__ volatile (
313 ".set arch=loongson3a \n\t"
314 "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
315 "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
316
317 "xor $f16, $f16, $f16 \n\t"
318 "xor $f18, $f18, $f18 \n\t"
319 "xor $f20, $f20, $f20 \n\t"
320 "xor $f22, $f22, $f22 \n\t"
321 "xor $f24, $f24, $f24 \n\t"
322 "xor $f26, $f26, $f26 \n\t"
323 "xor $f28, $f28, $f28 \n\t"
324 "xor $f30, $f30, $f30 \n\t"
325
326 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
327 "xor $f4, $f4, $f4 \n\t"
328 "xor $f6, $f6, $f6 \n\t"
329 "pcmpgth $f4, $f4, $f0 \n\t"
330 "pcmpgth $f6, $f6, $f2 \n\t"
331 "xor $f0, $f0, $f4 \n\t"
332 "xor $f2, $f2, $f6 \n\t"
333 "psubh $f0, $f0, $f4 \n\t"
334 "psubh $f2, $f2, $f6 \n\t"
335 "paddush $f0, $f0, $f8 \n\t"
336 "paddush $f2, $f2, $f10 \n\t"
337 "pmulhuh $f0, $f0, $f12 \n\t"
338 "pmulhuh $f2, $f2, $f14 \n\t"
339 "pmaxsh $f16, $f16, $f0 \n\t"
340 "pmaxsh $f18, $f18, $f2 \n\t"
341 "xor $f0, $f0, $f4 \n\t"
342 "xor $f2, $f2, $f6 \n\t"
343 "psubh $f0, $f0, $f4 \n\t"
344 "psubh $f2, $f2, $f6 \n\t"
345 "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
346
347 "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
348 "xor $f4, $f4, $f4 \n\t"
349 "xor $f6, $f6, $f6 \n\t"
350 "pcmpgth $f4, $f4, $f0 \n\t"
351 "pcmpgth $f6, $f6, $f2 \n\t"
352 "xor $f0, $f0, $f4 \n\t"
353 "xor $f2, $f2, $f6 \n\t"
354 "psubh $f0, $f0, $f4 \n\t"
355 "psubh $f2, $f2, $f6 \n\t"
356 "paddush $f0, $f0, $f8 \n\t"
357 "paddush $f2, $f2, $f10 \n\t"
358 "pmulhuh $f0, $f0, $f12 \n\t"
359 "pmulhuh $f2, $f2, $f14 \n\t"
360 "pmaxsh $f16, $f16, $f0 \n\t"
361 "pmaxsh $f18, $f18, $f2 \n\t"
362 "xor $f0, $f0, $f4 \n\t"
363 "xor $f2, $f2, $f6 \n\t"
364 "psubh $f0, $f0, $f4 \n\t"
365 "psubh $f2, $f2, $f6 \n\t"
366 "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
367
368 "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
369 "xor $f4, $f4, $f4 \n\t"
370 "xor $f6, $f6, $f6 \n\t"
371 "pcmpgth $f4, $f4, $f0 \n\t"
372 "pcmpgth $f6, $f6, $f2 \n\t"
373 "xor $f0, $f0, $f4 \n\t"
374 "xor $f2, $f2, $f6 \n\t"
375 "psubh $f0, $f0, $f4 \n\t"
376 "psubh $f2, $f2, $f6 \n\t"
377 "paddush $f0, $f0, $f8 \n\t"
378 "paddush $f2, $f2, $f10 \n\t"
379 "pmulhuh $f0, $f0, $f12 \n\t"
380 "pmulhuh $f2, $f2, $f14 \n\t"
381 "pmaxsh $f20, $f20, $f0 \n\t"
382 "pmaxsh $f22, $f22, $f2 \n\t"
383 "xor $f0, $f0, $f4 \n\t"
384 "xor $f2, $f2, $f6 \n\t"
385 "psubh $f0, $f0, $f4 \n\t"
386 "psubh $f2, $f2, $f6 \n\t"
387 "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
388
389 "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
390 "xor $f4, $f4, $f4 \n\t"
391 "xor $f6, $f6, $f6 \n\t"
392 "pcmpgth $f4, $f4, $f0 \n\t"
393 "pcmpgth $f6, $f6, $f2 \n\t"
394 "xor $f0, $f0, $f4 \n\t"
395 "xor $f2, $f2, $f6 \n\t"
396 "psubh $f0, $f0, $f4 \n\t"
397 "psubh $f2, $f2, $f6 \n\t"
398 "paddush $f0, $f0, $f8 \n\t"
399 "paddush $f2, $f2, $f10 \n\t"
400 "pmulhuh $f0, $f0, $f12 \n\t"
401 "pmulhuh $f2, $f2, $f14 \n\t"
402 "pmaxsh $f20, $f20, $f0 \n\t"
403 "pmaxsh $f22, $f22, $f2 \n\t"
404 "xor $f0, $f0, $f4 \n\t"
405 "xor $f2, $f2, $f6 \n\t"
406 "psubh $f0, $f0, $f4 \n\t"
407 "psubh $f2, $f2, $f6 \n\t"
408 "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
409
410 "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
411 "xor $f4, $f4, $f4 \n\t"
412 "xor $f6, $f6, $f6 \n\t"
413 "pcmpgth $f4, $f4, $f0 \n\t"
414 "pcmpgth $f6, $f6, $f2 \n\t"
415 "xor $f0, $f0, $f4 \n\t"
416 "xor $f2, $f2, $f6 \n\t"
417 "psubh $f0, $f0, $f4 \n\t"
418 "psubh $f2, $f2, $f6 \n\t"
419 "paddush $f0, $f0, $f8 \n\t"
420 "paddush $f2, $f2, $f10 \n\t"
421 "pmulhuh $f0, $f0, $f12 \n\t"
422 "pmulhuh $f2, $f2, $f14 \n\t"
423 "pmaxsh $f24, $f24, $f0 \n\t"
424 "pmaxsh $f26, $f26, $f2 \n\t"
425 "xor $f0, $f0, $f4 \n\t"
426 "xor $f2, $f2, $f6 \n\t"
427 "psubh $f0, $f0, $f4 \n\t"
428 "psubh $f2, $f2, $f6 \n\t"
429 "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
430
431 "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
432 "xor $f4, $f4, $f4 \n\t"
433 "xor $f6, $f6, $f6 \n\t"
434 "pcmpgth $f4, $f4, $f0 \n\t"
435 "pcmpgth $f6, $f6, $f2 \n\t"
436 "xor $f0, $f0, $f4 \n\t"
437 "xor $f2, $f2, $f6 \n\t"
438 "psubh $f0, $f0, $f4 \n\t"
439 "psubh $f2, $f2, $f6 \n\t"
440 "paddush $f0, $f0, $f8 \n\t"
441 "paddush $f2, $f2, $f10 \n\t"
442 "pmulhuh $f0, $f0, $f12 \n\t"
443 "pmulhuh $f2, $f2, $f14 \n\t"
444 "pmaxsh $f24, $f24, $f0 \n\t"
445 "pmaxsh $f26, $f26, $f2 \n\t"
446 "xor $f0, $f0, $f4 \n\t"
447 "xor $f2, $f2, $f6 \n\t"
448 "psubh $f0, $f0, $f4 \n\t"
449 "psubh $f2, $f2, $f6 \n\t"
450 "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
451
452 "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
453 "xor $f4, $f4, $f4 \n\t"
454 "xor $f6, $f6, $f6 \n\t"
455 "pcmpgth $f4, $f4, $f0 \n\t"
456 "pcmpgth $f6, $f6, $f2 \n\t"
457 "xor $f0, $f0, $f4 \n\t"
458 "xor $f2, $f2, $f6 \n\t"
459 "psubh $f0, $f0, $f4 \n\t"
460 "psubh $f2, $f2, $f6 \n\t"
461 "paddush $f0, $f0, $f8 \n\t"
462 "paddush $f2, $f2, $f10 \n\t"
463 "pmulhuh $f0, $f0, $f12 \n\t"
464 "pmulhuh $f2, $f2, $f14 \n\t"
465 "pmaxsh $f28, $f28, $f0 \n\t"
466 "pmaxsh $f30, $f30, $f2 \n\t"
467 "xor $f0, $f0, $f4 \n\t"
468 "xor $f2, $f2, $f6 \n\t"
469 "psubh $f0, $f0, $f4 \n\t"
470 "psubh $f2, $f2, $f6 \n\t"
471 "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
472
473 "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
474 "xor $f4, $f4, $f4 \n\t"
475 "xor $f6, $f6, $f6 \n\t"
476 "pcmpgth $f4, $f4, $f0 \n\t"
477 "pcmpgth $f6, $f6, $f2 \n\t"
478 "xor $f0, $f0, $f4 \n\t"
479 "xor $f2, $f2, $f6 \n\t"
480 "psubh $f0, $f0, $f4 \n\t"
481 "psubh $f2, $f2, $f6 \n\t"
482 "paddush $f0, $f0, $f8 \n\t"
483 "paddush $f2, $f2, $f10 \n\t"
484 "pmulhuh $f0, $f0, $f12 \n\t"
485 "pmulhuh $f2, $f2, $f14 \n\t"
486 "pmaxsh $f28, $f28, $f0 \n\t"
487 "pmaxsh $f30, $f30, $f2 \n\t"
488 "xor $f0, $f0, $f4 \n\t"
489 "xor $f2, $f2, $f6 \n\t"
490 "psubh $f0, $f0, $f4 \n\t"
491 "psubh $f2, $f2, $f6 \n\t"
492 "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
493
494 "mov.d $f0, $f18 \n\t"
495 "punpckhhw $f18, $f16, $f20 \n\t"
496 "punpcklhw $f16, $f16, $f20 \n\t"
497 "punpckhhw $f2, $f0, $f22 \n\t"
498 "punpcklhw $f0, $f0, $f22 \n\t"
499
500 "mov.d $f20, $f26 \n\t"
501 "punpckhhw $f26, $f24, $f28 \n\t"
502 "punpcklhw $f24, $f24, $f28 \n\t"
503 "punpckhhw $f22, $f20, $f30 \n\t"
504 "punpcklhw $f20, $f20, $f30 \n\t"
505
506 "mov.d $f28, $f18 \n\t"
507 "punpckhwd $f18, $f16, $f24 \n\t"
508 "punpcklwd $f16, $f16, $f24 \n\t"
509 "punpckhwd $f30, $f28, $f26 \n\t"
510 "punpcklwd $f28, $f28, $f26 \n\t"
511
512 "mov.d $f24, $f2 \n\t"
513 "punpckhwd $f2, $f0, $f20 \n\t"
514 "punpcklwd $f0, $f0, $f20 \n\t"
515 "punpckhwd $f26, $f24, $f22 \n\t"
516 "punpcklwd $f24, $f24, $f22 \n\t"
517
518 "mov.d $f20, $f18 \n\t"
519 "mov.d $f18, $f0 \n\t"
520 "mov.d $f22, $f2 \n\t"
521
522 "mov.d $f0, $f30 \n\t"
523 "mov.d $f30, $f24 \n\t"
524 "mov.d $f2, $f26 \n\t"
525
526 "pmaxsh $f0, $f0, $f16 \n\t"
527 "pmaxsh $f2, $f2, $f18 \n\t"
528
529 "pmaxsh $f0, $f0, $f20 \n\t"
530 "pmaxsh $f2, $f2, $f22 \n\t"
531
532 "pmaxsh $f0, $f0, $f28 \n\t"
533 "pmaxsh $f2, $f2, $f30 \n\t"
534
535 "mov.d $f4, $f0 \n\t"
536 "mov.d $f6, $f2 \n\t"
537
538 "mov.d $f0, $f2 \n\t"
539 "mov.d $f2, $f6 \n\t"
540
541 "pmaxsh $f0, $f0, $f4 \n\t"
542 "pmaxsh $f2, $f2, $f6 \n\t"
543
544 "gssdlc1 $f0, 0x7(%[max]) \n\t"
545 "gssdrc1 $f0, 0x0(%[max]) \n\t"
546 :
547 : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
548 [max]"r"((short *)max)
549 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
550 "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
551 );
552 RECOVER_REG;
553 }
554