1 /*!
2 * \copy
3 * Copyright (c) 2009-2018, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file deblock_mmi.c
33 *
34 * \brief Loongson MMI optimized deblocking filter routines
35 *
36 * \date 20/07/2018 Created
37 *
38 *************************************************************************************
39 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
DeblockLumaLt4V_mmi(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)43 void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
44 int32_t iBeta, int8_t *pTC) {
45 unsigned char tmp[512] __attribute__((aligned(32)));
46 BACKUP_REG;
47 __asm__ volatile (
48 ".set arch=loongson3a \n\t"
49 "dsll $8, %[iStride], 0x1 \n\t"
50 "daddu $8, $8, %[iStride] \n\t"
51 "dsubu $14, %[pPix], $8 \n\t"
52
53 "dsll $8, %[iStride], 0x1 \n\t"
54 "dsubu $9, %[pPix], $8 \n\t"
55
56 "dmtc1 %[iAlpha], $f0 \n\t"
57 "dsubu $13, %[pPix], %[iStride] \n\t"
58 "daddu %[iStride], %[iStride], %[pPix] \n\t"
59 "daddu $12, $8, %[pPix] \n\t"
60
61 "punpcklhw $f0, $f0, $f0 \n\t"
62 "lb $8, 0x0(%[pTC]) \n\t"
63 "punpcklwd $f0, $f0, $f0 \n\t"
64 "mov.d $f2, $f0 \n\t"
65 "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
66 "dmtc1 %[iBeta], $f0 \n\t"
67 "lb %[iAlpha], 0x1(%[pTC]) \n\t"
68 "dli %[iBeta], 0xFFFF \n\t"
69 "punpcklhw $f0, $f0, $f0 \n\t"
70 "and $10, %[iAlpha], %[iBeta] \n\t"
71 "punpcklwd $f0, $f0, $f0 \n\t"
72 "mov.d $f2, $f0 \n\t"
73 "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
74 "dmtc1 $10, $f4 \n\t"
75 "mov.d $f8, $f4 \n\t"
76 "dmtc1 %[iAlpha], $f16 \n\t"
77 "and %[iAlpha], $8, %[iBeta] \n\t"
78 "dmtc1 %[iAlpha], $f20 \n\t"
79 "mov.d $f24, $f20 \n\t"
80 "mov.d $f28, $f20 \n\t"
81 "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
82 "dmtc1 %[iAlpha], $f0 \n\t"
83
84 "lb %[iAlpha], 0x3(%[pTC]) \n\t"
85 "lb %[pTC], 0x2(%[pTC]) \n\t"
86 "dmtc1 $10, $f12 \n\t"
87 "punpcklhw $f0, $f0, $f16 \n\t"
88 "and $8, %[iAlpha], %[iBeta] \n\t"
89 "punpcklhw $f24, $f24, $f8 \n\t"
90 "punpcklhw $f20, $f20, $f4 \n\t"
91 "punpcklhw $f0, $f0, $f24 \n\t"
92 "punpcklhw $f28, $f28, $f12 \n\t"
93 "punpcklhw $f28, $f28, $f20 \n\t"
94 "punpckhhw $f2, $f0, $f28 \n\t"
95 "punpcklhw $f0, $f0, $f28 \n\t"
96 "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
97 "dmtc1 $8, $f0 \n\t"
98 "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
99 "mov.d $f8, $f0 \n\t"
100 "dmtc1 %[iAlpha], $f16 \n\t"
101 "and %[iAlpha], %[pTC], %[iBeta] \n\t"
102 "dmtc1 $8, $f12 \n\t"
103 "dmtc1 %[iAlpha], $f20 \n\t"
104 "punpcklhw $f20, $f20, $f0 \n\t"
105
106 "xor $f0, $f0, $f0 \n\t"
107 "dmtc1 %[iAlpha], $f24 \n\t"
108 "and %[pTC], %[pTC], %[iBeta] \n\t"
109 "punpcklhw $f24, $f24, $f8 \n\t"
110 "dmtc1 %[iAlpha], $f28 \n\t"
111 "dmtc1 %[pTC], $f4 \n\t"
112
113 "gslqc1 $f10, $f8, 0x0($9) \n\t"
114 "punpckhbh $f10, $f8, $f0 \n\t"
115 "punpcklbh $f8, $f8, $f0 \n\t"
116
117 "dli %[iAlpha], 0x4 \n\t"
118 "seh %[pTC], %[iAlpha] \n\t"
119 "punpcklhw $f28, $f28, $f12 \n\t"
120 "punpcklhw $f28, $f28, $f20 \n\t"
121 "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
122 "gslqc1 $f14, $f12, 0x0($13) \n\t"
123 "gsldxc1 $f2, 0x0($12, $0) \n\t"
124 "punpckhbh $f22, $f20, $f0 \n\t"
125 "punpcklbh $f20, $f20, $f0 \n\t"
126 "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
127 "punpckhbh $f22, $f2, $f0 \n\t"
128 "punpcklbh $f20, $f2, $f0 \n\t"
129 "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
130 "punpcklhw $f4, $f4, $f16 \n\t"
131 "gslqc1 $f18, $f16, 0x0($14) \n\t"
132 "punpcklhw $f4, $f4, $f24 \n\t"
133 "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
134 "punpckhhw $f6, $f4, $f28 \n\t"
135 "punpcklhw $f4, $f4, $f28 \n\t"
136 "punpckhbh $f26, $f24, $f0 \n\t"
137 "punpcklbh $f24, $f24, $f0 \n\t"
138 "punpckhbh $f14, $f12, $f0 \n\t"
139 "punpcklbh $f12, $f12, $f0 \n\t"
140 "punpckhbh $f18, $f16, $f0 \n\t"
141 "punpcklbh $f16, $f16, $f0 \n\t"
142 "psubh $f28, $f12, $f16 \n\t"
143 "psubh $f30, $f14, $f18 \n\t"
144 "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
145 WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
146 "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
147 "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
148 "pcmpgth $f20, $f16, $f28 \n\t"
149 "pcmpgth $f22, $f18, $f30 \n\t"
150 "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
151 "psubh $f28, $f24, $f0 \n\t"
152 "psubh $f30, $f26, $f2 \n\t"
153 WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
154 "pcmpgth $f20, $f16, $f28 \n\t"
155 "pcmpgth $f22, $f18, $f30 \n\t"
156 "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
157 "pavgh $f20, $f12, $f24 \n\t"
158 "pavgh $f22, $f14, $f26 \n\t"
159 "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
160 "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
161 "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
162 "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
163 "psubh $f20, $f20, $f28 \n\t"
164 "psubh $f22, $f22, $f30 \n\t"
165 "psubh $f20, $f20, $f0 \n\t"
166 "psubh $f22, $f22, $f2 \n\t"
167 "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
168 "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
169 "psubh $f20, $f24, $f12 \n\t"
170 "psubh $f22, $f26, $f14 \n\t"
171 "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
172 "psubh $f24, $f24, $f0 \n\t"
173 "psubh $f26, $f26, $f2 \n\t"
174 "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
175 WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
176 "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
177 "pcmpgth $f20, $f20, $f28 \n\t"
178 "pcmpgth $f22, $f22, $f30 \n\t"
179 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
180 "pcmpgth $f28, $f16, $f24 \n\t"
181 "pcmpgth $f30, $f18, $f26 \n\t"
182
183 "xor $f0, $f0, $f0 \n\t"
184 "and $f20, $f20, $f28 \n\t"
185 "and $f22, $f22, $f30 \n\t"
186 "psubh $f24, $f12, $f8 \n\t"
187 "psubh $f26, $f14, $f10 \n\t"
188 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
189 "pcmpgth $f28, $f16, $f24 \n\t"
190 "pcmpgth $f30, $f18, $f26 \n\t"
191 "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
192 "and $f20, $f20, $f28 \n\t"
193 "and $f22, $f22, $f30 \n\t"
194 "pcmpgth $f28, $f24, $f0 \n\t"
195 "pcmpgth $f30, $f26, $f0 \n\t"
196 "pcmpeqh $f24, $f24, $f0 \n\t"
197 "pcmpeqh $f26, $f26, $f0 \n\t"
198 "or $f28, $f28, $f24 \n\t"
199 "or $f30, $f30, $f26 \n\t"
200 "and $f20, $f20, $f28 \n\t"
201 "and $f22, $f22, $f30 \n\t"
202 "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
203 "dmtc1 %[pTC], $f20 \n\t"
204 "punpckhhw $f26, $f20, $f20 \n\t"
205 "punpcklhw $f24, $f20, $f20 \n\t"
206 "punpcklwd $f20, $f24, $f24 \n\t"
207 "mov.d $f22, $f20 \n\t"
208 "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
209 "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
210 "psubh $f24, $f0, $f20 \n\t"
211 "dli $11, 0x2 \n\t"
212 "psubh $f26, $f0, $f22 \n\t"
213 "dmtc1 $11, $f28 \n\t"
214 "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
215 "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
216 "psllh $f20, $f20, $f28 \n\t"
217 "psllh $f22, $f22, $f28 \n\t"
218 "psubh $f28, $f8, $f0 \n\t"
219 "psubh $f30, $f10, $f2 \n\t"
220 "paddh $f28, $f28, $f20 \n\t"
221 "paddh $f30, $f30, $f22 \n\t"
222 "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
223 "paddh $f28, $f28, $f20 \n\t"
224 "paddh $f30, $f30, $f22 \n\t"
225 "dli $11, 0x3 \n\t"
226 "dmtc1 $11, $f20 \n\t"
227 "psrah $f28, $f28, $f20 \n\t"
228 "psrah $f30, $f30, $f20 \n\t"
229 "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
230 "pmaxsh $f24, $f24, $f28 \n\t"
231 "pmaxsh $f26, $f26, $f30 \n\t"
232 "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
233 "pminsh $f20, $f20, $f24 \n\t"
234 "pminsh $f22, $f22, $f26 \n\t"
235
236 "and $f20, $f20, $f0 \n\t"
237 "and $f22, $f22, $f2 \n\t"
238 "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
239 "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
240 "xor $f0, $f0, $f0 \n\t"
241 "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
242 "psubh $f20, $f0, $f24 \n\t"
243 "psubh $f22, $f0, $f26 \n\t"
244 "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
245 "mov.d $f24, $f20 \n\t"
246 "mov.d $f26, $f22 \n\t"
247 "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
248 "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
249 "paddh $f20, $f20, $f28 \n\t"
250 "paddh $f22, $f22, $f30 \n\t"
251 "paddh $f28, $f8, $f8 \n\t"
252 "paddh $f30, $f10, $f10 \n\t"
253 "psubh $f20, $f20, $f28 \n\t"
254 "psubh $f22, $f22, $f30 \n\t"
255 "dli $11, 0x1 \n\t"
256 "dmtc1 $11, $f28 \n\t"
257 "psrah $f20, $f20, $f28 \n\t"
258 "psrah $f22, $f22, $f28 \n\t"
259 "pmaxsh $f24, $f24, $f20 \n\t"
260 "pmaxsh $f26, $f26, $f22 \n\t"
261 "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
262 "pminsh $f20, $f20, $f24 \n\t"
263 "pminsh $f22, $f22, $f26 \n\t"
264
265 "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
266 "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
267 "and $f20, $f20, $f24 \n\t"
268 "and $f22, $f22, $f26 \n\t"
269 "and $f20, $f20, $f28 \n\t"
270 "and $f22, $f22, $f30 \n\t"
271 "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
272 "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
273 "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
274 "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
275 "paddh $f20, $f20, $f28 \n\t"
276 "paddh $f22, $f22, $f30 \n\t"
277 "paddh $f28, $f24, $f24 \n\t"
278 "paddh $f30, $f26, $f26 \n\t"
279 "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
280 "dli $11, 0x1 \n\t"
281 "psubh $f20, $f20, $f28 \n\t"
282 "dmtc1 $11, $f28 \n\t"
283 "psubh $f22, $f22, $f30 \n\t"
284
285 "psrah $f20, $f20, $f28 \n\t"
286 "psrah $f22, $f22, $f28 \n\t"
287 "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
288 "pmaxsh $f24, $f24, $f20 \n\t"
289 "pmaxsh $f26, $f26, $f22 \n\t"
290 "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
291 "pminsh $f20, $f20, $f24 \n\t"
292 "pminsh $f22, $f22, $f26 \n\t"
293 "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
294 "and $f20, $f20, $f24 \n\t"
295 "and $f22, $f22, $f26 \n\t"
296 "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
297 "and $f20, $f20, $f24 \n\t"
298 "and $f22, $f22, $f26 \n\t"
299 "gslqc1 $f26, $f24, 0x0($9) \n\t"
300 "punpcklbh $f28, $f30, $f0 \n\t"
301 "punpckhbh $f30, $f30, $f0 \n\t"
302 "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
303
304 "gslqc1 $f30, $f28, 0x0($12) \n\t"
305 "punpcklbh $f24, $f26, $f0 \n\t"
306 "punpckhbh $f26, $f26, $f0 \n\t"
307 "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
308 "gslqc1 $f22, $f20, 0x0($14) \n\t"
309 "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
310 "gslqc1 $f26, $f24, 0x0($13) \n\t"
311 "punpcklbh $f28, $f30, $f0 \n\t"
312 "punpckhbh $f30, $f30, $f0 \n\t"
313 "punpcklbh $f20, $f22, $f0 \n\t"
314 "punpckhbh $f22, $f22, $f0 \n\t"
315 "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
316 "punpcklbh $f24, $f26, $f0 \n\t"
317 "punpckhbh $f26, $f26, $f0 \n\t"
318 "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
319
320 "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
321 "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
322 "psubh $f28, $f28, $f20 \n\t"
323 "psubh $f30, $f30, $f22 \n\t"
324 "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
325 WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
326 "punpcklbh $f24, $f26, $f0 \n\t"
327 "punpckhbh $f26, $f26, $f0 \n\t"
328 "pcmpgth $f20, $f16, $f28 \n\t"
329 "pcmpgth $f22, $f18, $f30 \n\t"
330 "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
331 "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
332
333 "psubh $f28, $f24, $f28 \n\t"
334 "psubh $f30, $f26, $f30 \n\t"
335 WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
336 "pcmpgth $f20, $f16, $f28 \n\t"
337 "pcmpgth $f22, $f18, $f30 \n\t"
338 "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
339
340 "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
341 "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
342 "pavgh $f20, $f20, $f24 \n\t"
343 "pavgh $f22, $f22, $f26 \n\t"
344 "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
345
346 "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
347 "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
348 "psubh $f20, $f4, $f20 \n\t"
349 "psubh $f22, $f6, $f22 \n\t"
350 "psubh $f20, $f20, $f28 \n\t"
351 "psubh $f22, $f22, $f30 \n\t"
352 "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
353 "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
354 "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
355 "psubh $f20, $f24, $f20 \n\t"
356 "psubh $f22, $f26, $f22 \n\t"
357 "psubh $f24, $f24, $f28 \n\t"
358 "psubh $f26, $f26, $f30 \n\t"
359 "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
360 "mov.d $f28, $f20 \n\t"
361 "mov.d $f30, $f22 \n\t"
362 WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
363 "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
364 "pcmpgth $f20, $f20, $f28 \n\t"
365 "pcmpgth $f22, $f22, $f30 \n\t"
366 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
367 "pcmpgth $f28, $f16, $f24 \n\t"
368 "pcmpgth $f30, $f18, $f26 \n\t"
369 "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
370
371 "and $f20, $f20, $f28 \n\t"
372 "and $f22, $f22, $f30 \n\t"
373 "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
374 "psubh $f28, $f28, $f24 \n\t"
375 "psubh $f30, $f30, $f26 \n\t"
376 "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
377 "psubh $f24, $f24, $f0 \n\t"
378 "psubh $f26, $f26, $f2 \n\t"
379 WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
380 "pcmpgth $f16, $f16, $f28 \n\t"
381 "pcmpgth $f18, $f18, $f30 \n\t"
382 "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
383 "and $f20, $f20, $f16 \n\t"
384 "and $f22, $f22, $f18 \n\t"
385 "xor $f0, $f0, $f0 \n\t"
386
387 "paddh $f8, $f8, $f28 \n\t"
388 "paddh $f10, $f10, $f30 \n\t"
389 "pcmpgth $f16, $f4, $f0 \n\t"
390 "pcmpgth $f18, $f6, $f0 \n\t"
391 "pcmpeqh $f28, $f4, $f0 \n\t"
392 "pcmpeqh $f30, $f6, $f0 \n\t"
393 "or $f16, $f16, $f28 \n\t"
394 "or $f18, $f18, $f30 \n\t"
395 "and $f20, $f20, $f16 \n\t"
396 "and $f22, $f22, $f18 \n\t"
397 "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
398 "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
399 "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
400 "dli $11, 0x2 \n\t"
401 "psubh $f28, $f0, $f16 \n\t"
402 "psubh $f30, $f0, $f18 \n\t"
403 "psubh $f2, $f0, $f6 \n\t"
404 "psubh $f0, $f0, $f4 \n\t"
405 "dmfc1 %[iAlpha], $f28 \n\t"
406 "dmtc1 $11, $f28 \n\t"
407 "psllh $f20, $f20, $f28 \n\t"
408 "psllh $f22, $f22, $f28 \n\t"
409 "dmtc1 %[iAlpha], $f28 \n\t"
410 "paddh $f24, $f24, $f20 \n\t"
411 "paddh $f26, $f26, $f22 \n\t"
412 "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
413 "paddh $f24, $f24, $f20 \n\t"
414 "paddh $f26, $f26, $f22 \n\t"
415 "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
416 "dli $11, 0x3 \n\t"
417 "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
418 "dmfc1 %[iAlpha], $f0 \n\t"
419 "dmtc1 $11, $f0 \n\t"
420 "psrah $f24, $f24, $f0 \n\t"
421 "psrah $f26, $f26, $f0 \n\t"
422 "dmtc1 %[iAlpha], $f0 \n\t"
423 "pmaxsh $f28, $f28, $f24 \n\t"
424 "pmaxsh $f30, $f30, $f26 \n\t"
425 "pminsh $f16, $f16, $f28 \n\t"
426 "pminsh $f18, $f18, $f30 \n\t"
427 "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
428 "and $f16, $f16, $f28 \n\t"
429 "and $f18, $f18, $f30 \n\t"
430 "mov.d $f24, $f0 \n\t"
431 "mov.d $f26, $f2 \n\t"
432 "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
433 "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
434 "paddh $f0, $f0, $f28 \n\t"
435 "paddh $f2, $f2, $f30 \n\t"
436 "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
437 "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
438 "dli $11, 0x1 \n\t"
439 "paddh $f16, $f16, $f16 \n\t"
440 "paddh $f18, $f18, $f18 \n\t"
441 "psubh $f0, $f0, $f16 \n\t"
442 "psubh $f2, $f2, $f18 \n\t"
443
444 "dmtc1 $11, $f28 \n\t"
445 "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
446 "psrah $f0, $f0, $f28 \n\t"
447 "psrah $f2, $f2, $f28 \n\t"
448 "pmaxsh $f24, $f24, $f0 \n\t"
449 "pmaxsh $f26, $f26, $f2 \n\t"
450 "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
451 "pminsh $f28, $f4, $f24 \n\t"
452 "pminsh $f30, $f6, $f26 \n\t"
453 "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
454 "and $f28, $f28, $f24 \n\t"
455 "and $f30, $f30, $f26 \n\t"
456 "dmfc1 %[iAlpha], $f24 \n\t"
457 "dmfc1 %[iBeta], $f26 \n\t"
458 "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
459 "and $f28, $f28, $f24 \n\t"
460 "and $f30, $f30, $f26 \n\t"
461 "paddh $f20, $f20, $f28 \n\t"
462 "paddh $f22, $f22, $f30 \n\t"
463 "packushb $f8, $f8, $f10 \n\t"
464 "packushb $f10, $f20, $f22 \n\t"
465 "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
466 "paddh $f0, $f0, $f20 \n\t"
467 "paddh $f2, $f2, $f22 \n\t"
468 "paddh $f12, $f12, $f16 \n\t"
469 "paddh $f14, $f14, $f18 \n\t"
470 "packushb $f12, $f12, $f14 \n\t"
471 "packushb $f14, $f0, $f2 \n\t"
472
473 "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
474 "psubh $f0, $f0, $f16 \n\t"
475 "psubh $f2, $f2, $f18 \n\t"
476 "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
477 "psubh $f16, $f16, $f20 \n\t"
478 "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
479 "psubh $f18, $f18, $f22 \n\t"
480
481 "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
482 "paddh $f20, $f20, $f24 \n\t"
483 "paddh $f22, $f22, $f26 \n\t"
484 "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
485 "packushb $f0, $f0, $f2 \n\t"
486 "packushb $f2, $f16, $f18 \n\t"
487 "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
488 "paddh $f16, $f16, $f24 \n\t"
489 "paddh $f18, $f18, $f26 \n\t"
490 "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
491 "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
492 "mov.d $f28, $f0 \n\t"
493 "mov.d $f30, $f2 \n\t"
494 "paddh $f0, $f0, $f0 \n\t"
495 "paddh $f2, $f2, $f2 \n\t"
496
497 "dmtc1 %[iAlpha], $f24 \n\t"
498 "dmtc1 %[iBeta], $f26 \n\t"
499
500 "psubh $f16, $f16, $f0 \n\t"
501 "psubh $f18, $f18, $f2 \n\t"
502 "dli $11, 0x1 \n\t"
503 "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
504 "gssqc1 $f10, $f8, 0x0($9) \n\t"
505 "dmtc1 $11, $f8 \n\t"
506 "psrah $f16, $f16, $f8 \n\t"
507 "psrah $f18, $f18, $f8 \n\t"
508 "pmaxsh $f0, $f0, $f16 \n\t"
509 "pmaxsh $f2, $f2, $f18 \n\t"
510 "pminsh $f4, $f4, $f0 \n\t"
511 "pminsh $f6, $f6, $f2 \n\t"
512 "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
513
514 "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
515 "and $f4, $f4, $f24 \n\t"
516 "and $f6, $f6, $f26 \n\t"
517 "and $f4, $f4, $f8 \n\t"
518 "and $f6, $f6, $f10 \n\t"
519 "gssqc1 $f14, $f12, 0x0($13) \n\t"
520 "paddh $f28, $f28, $f4 \n\t"
521 "paddh $f30, $f30, $f6 \n\t"
522 "packushb $f20, $f20, $f22 \n\t"
523 "packushb $f22, $f28, $f30 \n\t"
524 "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
525 "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
526 : [pPix]"+&r"((unsigned char *)pPix)
527 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
528 [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
529 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
530 "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
531 "$f22", "$f24", "$f26", "$f28", "$f30"
532 );
533 RECOVER_REG;
534 }
535
/*!
 * \brief Gather 16 rows of 8 bytes from a strided buffer, pass them through
 *        MMI_TransTwo8x8B and store the result as 128 contiguous bytes.
 *
 * Rows k = 0..15 are read from pPixY + k*iStride, 8 bytes each, with
 * gsldlc1/gsldrc1 pairs at offsets 7/0 (the unaligned 64-bit load idiom).
 * Row k (k < 8) lands in $f(4k) and row k+8 in $f(4k+2). MMI_TransTwo8x8B is
 * defined in asmdefs_mmi.h (not shown here); judging by its name it
 * transposes two 8x8 byte blocks -- confirm against that header.
 *
 * \param pPixY   address of the first source row; the register holding it is
 *                advanced inside the asm (it is a "+&r" operand).
 * \param iStride byte distance between consecutive source rows.
 * \param pDst    destination; eight 16-byte gssqc1 stores cover offsets
 *                0x00..0x7f. NOTE(review): gssqc1 presumably requires pDst to
 *                be 16-byte aligned -- confirm.
 */
DeblockLumaTransposeH2V_mmi(uint8_t * pPixY,int32_t iStride,uint8_t * pDst)536 void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
537 uint8_t *pDst) {
538 BACKUP_REG;
539 __asm__ volatile(
540 ".set arch=loongson3a \n\t"
/* $8 = address of row 8 (pPixY + 8*iStride) */
541 "dsll $8, %[iStride], 0x3 \n\t"
542 "daddu $8, $8, %[pPixY] \n\t"
543
/* Rows 0, 8, 1, 9 -> $f0, $f2, $f4, $f6 */
544 "daddu $9, %[pPixY], %[iStride] \n\t"
545 "daddu $10, $8, %[iStride] \n\t"
546 "gsldlc1 $f0, 0x7(%[pPixY]) \n\t"
547 "gsldlc1 $f2, 0x7($8) \n\t"
548 "gsldlc1 $f4, 0x7($9) \n\t"
549 "gsldlc1 $f6, 0x7($10) \n\t"
550 "gsldrc1 $f0, 0x0(%[pPixY]) \n\t"
551 "gsldrc1 $f2, 0x0($8) \n\t"
552 "gsldrc1 $f4, 0x0($9) \n\t"
553 "gsldrc1 $f6, 0x0($10) \n\t"
/* Rows 2, 10, 3, 11 -> $f8, $f10, $f12, $f14 */
554 "daddu %[pPixY], $9, %[iStride] \n\t"
555 "daddu $8, $10, %[iStride] \n\t"
556 "daddu $9, %[pPixY], %[iStride] \n\t"
557 "daddu $10, $8, %[iStride] \n\t"
558 "gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
559 "gsldlc1 $f10, 0x7($8) \n\t"
560 "gsldlc1 $f12, 0x7($9) \n\t"
561 "gsldlc1 $f14, 0x7($10) \n\t"
562 "gsldrc1 $f8, 0x0(%[pPixY]) \n\t"
563 "gsldrc1 $f10, 0x0($8) \n\t"
564 "gsldrc1 $f12, 0x0($9) \n\t"
565 "gsldrc1 $f14, 0x0($10) \n\t"
566
/* Rows 4, 12, 5, 13 -> $f16, $f18, $f20, $f22 */
567 "daddu %[pPixY], $9, %[iStride] \n\t"
568 "daddu $8, $10, %[iStride] \n\t"
569 "daddu $9, %[pPixY], %[iStride] \n\t"
570 "daddu $10, $8, %[iStride] \n\t"
571 "gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
572 "gsldlc1 $f18, 0x7($8) \n\t"
573 "gsldlc1 $f20, 0x7($9) \n\t"
574 "gsldlc1 $f22, 0x7($10) \n\t"
575 "gsldrc1 $f16, 0x0(%[pPixY]) \n\t"
576 "gsldrc1 $f18, 0x0($8) \n\t"
577 "gsldrc1 $f20, 0x0($9) \n\t"
578 "gsldrc1 $f22, 0x0($10) \n\t"
/* Rows 6, 14, 7, 15 -> $f24, $f26, $f28, $f30 */
579 "daddu %[pPixY], $9, %[iStride] \n\t"
580 "daddu $8, $10, %[iStride] \n\t"
581 "daddu $9, %[pPixY], %[iStride] \n\t"
582 "daddu $10, $8, %[iStride] \n\t"
583 "gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
584 "gsldlc1 $f26, 0x7($8) \n\t"
585
586 "gsldlc1 $f28, 0x7($9) \n\t"
587 "gsldlc1 $f30, 0x7($10) \n\t"
588 "gsldrc1 $f24, 0x0(%[pPixY]) \n\t"
589 "gsldrc1 $f26, 0x0($8) \n\t"
590 "gsldrc1 $f28, 0x0($9) \n\t"
591 "gsldrc1 $f30, 0x0($10) \n\t"
592
/* $9 and $10 are handed to the macro; both are listed as clobbers below. */
593 MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
594 $f14, $f16, $f18, $f20, $f22, $f24,
595 $f26, $f28, $f30, $9, $10)
596
/* Store the eight register pairs, 16 bytes each, in the order the macro
   leaves them. */
597 "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t"
598 "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
599 "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
600 "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
601 "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
602 "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
603 "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
604 "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
605 : [pPixY] "+&r"((unsigned char *)pPixY)
606 : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
607 : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
608 "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
609 "$f30"
610 );
611 RECOVER_REG;
612 }
613
/*!
 * \brief Load 128 contiguous bytes, pass them through MMI_TransTwo8x8B and
 *        scatter the result as 16 rows of 8 bytes into a strided buffer.
 *
 * Eight 16-byte gslqc1 loads read pSrc offsets 0x00..0x7f. After the macro
 * (defined in asmdefs_mmi.h, not shown here; judging by its name it
 * transposes two 8x8 byte blocks -- confirm), row k is written to
 * pPixY + k*iStride for k = 0..15 with gssdlc1/gssdrc1 pairs at offsets 7/0
 * (the unaligned 64-bit store idiom).
 *
 * \param pPixY   address of the first destination row; the register holding
 *                it is advanced inside the asm (it is a "+&r" operand).
 * \param iStride byte distance between consecutive destination rows.
 * \param pSrc    source of the 128 bytes. NOTE(review): gslqc1 presumably
 *                requires pSrc to be 16-byte aligned -- confirm.
 */
DeblockLumaTransposeV2H_mmi(uint8_t * pPixY,int32_t iStride,uint8_t * pSrc)614 void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
615 uint8_t *pSrc) {
616 BACKUP_REG;
617 __asm__ volatile(
618 ".set arch=loongson3a \n\t"
/* Load eight 16-byte pairs from pSrc */
619 "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
620 "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
621 "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
622 "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
623 "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
624 "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
625 "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
626 "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
627
/* $9 and $10 are handed to the macro; both are listed as clobbers below. */
628 MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
629 $f14, $f16, $f18, $f20, $f22, $f24,
630 $f26, $f28, $f30, $9, $10)
631
/* Rows 0..3 <- $f16, $f8, $f12, $f28. %[pPixY] and $8 leapfrog by one
   stride each, so every group below writes two consecutive rows. */
632 "daddu $8, %[pPixY], %[iStride] \n\t"
633 "gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
634 "gssdlc1 $f8, 0x7($8) \n\t"
635 "gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
636 "gssdrc1 $f8, 0x0($8) \n\t"
637 "daddu %[pPixY], $8, %[iStride] \n\t"
638 "daddu $8, %[pPixY], %[iStride] \n\t"
639 "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
640 "gssdlc1 $f28, 0x7($8) \n\t"
641 "gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
642 "gssdrc1 $f28, 0x0($8) \n\t"
643
/* Rows 4..7 <- $f20, $f4, $f24, $f0 */
644 "daddu %[pPixY], $8, %[iStride] \n\t"
645 "daddu $8, %[pPixY], %[iStride] \n\t"
646 "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
647 "gssdlc1 $f4, 0x7($8) \n\t"
648 "gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
649 "gssdrc1 $f4, 0x0($8) \n\t"
650 "daddu %[pPixY], $8, %[iStride] \n\t"
651 "daddu $8, %[pPixY], %[iStride] \n\t"
652 "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
653 "gssdlc1 $f0, 0x7($8) \n\t"
654 "gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
655 "gssdrc1 $f0, 0x0($8) \n\t"
656
/* Rows 8..11 <- $f18, $f10, $f14, $f30 */
657 "daddu %[pPixY], $8, %[iStride] \n\t"
658 "daddu $8, %[pPixY], %[iStride] \n\t"
659 "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
660 "gssdlc1 $f10, 0x7($8) \n\t"
661 "gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
662 "gssdrc1 $f10, 0x0($8) \n\t"
663 "daddu %[pPixY], $8, %[iStride] \n\t"
664 "daddu $8, %[pPixY], %[iStride] \n\t"
665 "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
666 "gssdlc1 $f30, 0x7($8) \n\t"
667 "gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
668 "gssdrc1 $f30, 0x0($8) \n\t"
669
/* Rows 12..15 <- $f22, $f6, $f26, $f2 */
670 "daddu %[pPixY], $8, %[iStride] \n\t"
671 "daddu $8, %[pPixY], %[iStride] \n\t"
672 "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
673 "gssdlc1 $f6, 0x7($8) \n\t"
674 "gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
675 "gssdrc1 $f6, 0x0($8) \n\t"
676 "daddu %[pPixY], $8, %[iStride] \n\t"
677 "daddu $8, %[pPixY], %[iStride] \n\t"
678 "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
679 "gssdlc1 $f2, 0x7($8) \n\t"
680 "gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
681 "gssdrc1 $f2, 0x0($8) \n\t"
682 : [pPixY] "+&r"((unsigned char *)pPixY)
683 : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
684 : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
685 "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
686 "$f30"
687 );
688 RECOVER_REG;
689 }
690
DeblockLumaEq4V_mmi(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)691 void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
692 int32_t iBeta) {
693 unsigned char tmp[720] __attribute__((aligned(32)));
694 BACKUP_REG;
695 __asm__ volatile (
696 ".set arch=loongson3a \n\t"
697 "dsll $11, %[iStride], 0x2 \n\t"
698 "xor $f8, $f8, $f8 \n\t"
699 "daddu $14, %[iStride], %[pPix] \n\t"
700 "dsubu $8, %[pPix], $11 \n\t"
701 "gslqc1 $f14, $f12, 0x0($8) \n\t"
702 "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
703 "daddu $9, %[iStride], %[iStride] \n\t"
704 "daddu $10, $9, %[iStride] \n\t"
705 "move $12, $9 \n\t"
706 "dsubu $8, %[pPix], $9 \n\t"
707 "gslqc1 $f6, $f4, 0x0($8) \n\t"
708 "dsubu $9, %[pPix], %[iStride] \n\t"
709 "gslqc1 $f18, $f16, 0x0($9) \n\t"
710 "daddu $13, %[iStride], %[pPix] \n\t"
711
712 "move %[iStride], $12 \n\t"
713 "daddu $15, $12, %[pPix] \n\t"
714
715 "daddu $12, %[pPix], $10 \n\t"
716 "dsubu $11, %[pPix], $10 \n\t"
717
718 "gslqc1 $f26, $f24, 0x0($11) \n\t"
719 "daddu %[iStride], %[iStride], %[pPix] \n\t"
720 "dmtc1 %[iAlpha], $f0 \n\t"
721
722 "punpcklhw $f28, $f0, $f0 \n\t"
723 "punpcklwd $f0, $f28, $f28 \n\t"
724 "mov.d $f2, $f0 \n\t"
725 "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
726 "dmtc1 %[iBeta], $f0 \n\t"
727 "gsldxc1 $f10, 0x0($15, $0) \n\t"
728 "punpcklhw $f28, $f0, $f0 \n\t"
729 "punpcklwd $f0, $f28, $f28 \n\t"
730 "punpckhbh $f30, $f10, $f8 \n\t"
731 "mov.d $f2, $f0 \n\t"
732
733 "punpcklbh $f28, $f10, $f8 \n\t"
734 "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
735 "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
736 "mov.d $f0, $f4 \n\t"
737 "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
738 "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
739 "mov.d $f4, $f16 \n\t"
740 "punpckhbh $f22, $f20, $f8 \n\t"
741 "punpcklbh $f20, $f20, $f8 \n\t"
742 "punpckhbh $f6, $f4, $f8 \n\t"
743 "punpcklbh $f4, $f4, $f8 \n\t"
744
745 "psubh $f28, $f20, $f4 \n\t"
746 "psubh $f30, $f22, $f6 \n\t"
747 WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
748 "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
749 "punpckhbh $f2, $f0, $f8 \n\t"
750 "punpcklbh $f0, $f0, $f8 \n\t"
751 "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
752 "gslqc1 $f18, $f16, 0x0($14) \n\t"
753 "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
754
755 "psubh $f28, $f4, $f0 \n\t"
756 "psubh $f30, $f6, $f2 \n\t"
757
758 "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
759 WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
760 "punpckhbh $f18, $f16, $f8 \n\t"
761 "punpcklbh $f16, $f16, $f8 \n\t"
762 "pcmpgth $f0, $f0, $f28 \n\t"
763 "pcmpgth $f2, $f2, $f30 \n\t"
764 "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
765 "psubh $f28, $f20, $f16 \n\t"
766 "psubh $f30, $f22, $f18 \n\t"
767 "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
768 "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
769 "punpckhbh $f26, $f24, $f8 \n\t"
770 "punpcklbh $f24, $f24, $f8 \n\t"
771 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
772 "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
773 "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
774 "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
775 "pcmpgth $f16, $f16, $f28 \n\t"
776 "pcmpgth $f18, $f18, $f30 \n\t"
777 "and $f0, $f0, $f16 \n\t"
778 "and $f2, $f2, $f18 \n\t"
779 "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
780 "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
781 "dli %[iAlpha], 0x2 \n\t"
782 "dli %[iBeta], 0x2 \n\t"
783 "pcmpgth $f16, $f16, $f28 \n\t"
784 "pcmpgth $f18, $f18, $f30 \n\t"
785 "and $f0, $f0, $f16 \n\t"
786 "and $f2, $f2, $f18 \n\t"
787 "dmtc1 %[iAlpha], $f16 \n\t"
788 "dmtc1 %[iBeta], $f10 \n\t"
789 "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
790 "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
791
792 "punpcklhw $f28, $f16, $f16 \n\t"
793 "psrah $f16, $f0, $f10 \n\t"
794 "psrah $f18, $f2, $f10 \n\t"
795 "punpcklwd $f28, $f28, $f28 \n\t"
796 "mov.d $f30, $f28 \n\t"
797 "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
798 "paddh $f16, $f16, $f28 \n\t"
799 "paddh $f18, $f18, $f30 \n\t"
800 "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
801 "pcmpgth $f16, $f16, $f8 \n\t"
802 "pcmpgth $f18, $f18, $f10 \n\t"
803 "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
804
805 "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
806 "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
807 "psubh $f28, $f4, $f24 \n\t"
808 "psubh $f30, $f6, $f26 \n\t"
809 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
810 "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
811 "pcmpgth $f16, $f16, $f28 \n\t"
812 "pcmpgth $f18, $f18, $f30 \n\t"
813
814 "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
815 "and $f16, $f16, $f8 \n\t"
816 "and $f18, $f18, $f10 \n\t"
817 "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
818 "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
819 "psubh $f28, $f20, $f0 \n\t"
820 "psubh $f30, $f22, $f2 \n\t"
821 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
822 "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
823 "pcmpgth $f16, $f16, $f28 \n\t"
824 "pcmpgth $f18, $f18, $f30 \n\t"
825
826 "and $f16, $f16, $f8 \n\t"
827 "and $f18, $f18, $f10 \n\t"
828 "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
829
830 "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
831 "xor $f8, $f8, $f8 \n\t"
832 "pandn $f16, $f16, $f24 \n\t"
833 "dli %[iAlpha], 0x4 \n\t"
834 "pandn $f18, $f18, $f26 \n\t"
835 "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
836 "dmtc1 %[iAlpha], $f16 \n\t"
837 "punpcklhw $f28, $f16, $f16 \n\t"
838 "dli %[iAlpha], 0x1 \n\t"
839 "punpckhbh $f18, $f12, $f8 \n\t"
840 "dmtc1 %[iAlpha], $f30 \n\t"
841 "punpcklbh $f16, $f12, $f8 \n\t"
842 "psllh $f16, $f16, $f30 \n\t"
843 "psllh $f18, $f18, $f30 \n\t"
844 "paddh $f16, $f16, $f24 \n\t"
845 "paddh $f18, $f18, $f26 \n\t"
846 "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
847 "paddh $f16, $f16, $f24 \n\t"
848 "paddh $f18, $f18, $f26 \n\t"
849 "paddh $f16, $f16, $f24 \n\t"
850 "paddh $f18, $f18, $f26 \n\t"
851 "paddh $f16, $f16, $f0 \n\t"
852 "paddh $f18, $f18, $f2 \n\t"
853
854 "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
855 "punpcklwd $f28, $f28, $f28 \n\t"
856 "mov.d $f30, $f28 \n\t"
857 "paddh $f16, $f16, $f4 \n\t"
858 "paddh $f18, $f18, $f6 \n\t"
859 "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
860 "paddh $f16, $f16, $f20 \n\t"
861 "paddh $f18, $f18, $f22 \n\t"
862 "paddh $f16, $f16, $f28 \n\t"
863 "paddh $f18, $f18, $f30 \n\t"
864 "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
865 "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
866 "pandn $f24, $f24, $f28 \n\t"
867 "pandn $f26, $f26, $f30 \n\t"
868 "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
869 "gslqc1 $f26, $f24, 0x0($12) \n\t"
870 "dmtc1 %[iAlpha], $f10 \n\t"
871 "punpckhbh $f26, $f24, $f8 \n\t"
872 "punpcklbh $f24, $f24, $f8 \n\t"
873 "psllh $f24, $f24, $f10 \n\t"
874 "psllh $f26, $f26, $f10 \n\t"
875 "paddh $f24, $f24, $f28 \n\t"
876 "paddh $f26, $f26, $f30 \n\t"
877 "paddh $f24, $f24, $f28 \n\t"
878 "paddh $f26, $f26, $f30 \n\t"
879 "paddh $f24, $f24, $f28 \n\t"
880 "paddh $f26, $f26, $f30 \n\t"
881 "paddh $f24, $f24, $f0 \n\t"
882 "paddh $f26, $f26, $f2 \n\t"
883
884 "dli %[iAlpha], 0x3 \n\t"
885 "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
886 "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
887 "paddh $f24, $f24, $f20 \n\t"
888 "paddh $f26, $f26, $f22 \n\t"
889 "paddh $f24, $f24, $f4 \n\t"
890 "paddh $f26, $f26, $f6 \n\t"
891 "paddh $f24, $f24, $f0 \n\t"
892 "paddh $f26, $f26, $f2 \n\t"
893 "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
894 "dmtc1 %[iAlpha], $f10 \n\t"
895 "psrah $f24, $f24, $f10 \n\t"
896 "psrah $f26, $f26, $f10 \n\t"
897 "and $f24, $f24, $f0 \n\t"
898 "and $f26, $f26, $f2 \n\t"
899 "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
900 "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
901 "pandn $f24, $f24, $f28 \n\t"
902 "pandn $f26, $f26, $f30 \n\t"
903 "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
904 "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
905 "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
906 "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
907 "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
908 "dmtc1 %[iAlpha], $f10 \n\t"
909 "paddh $f24, $f24, $f28 \n\t"
910 "paddh $f26, $f26, $f30 \n\t"
911 "psrah $f16, $f16, $f10 \n\t"
912 "psrah $f18, $f18, $f10 \n\t"
913 "and $f16, $f16, $f0 \n\t"
914 "and $f18, $f18, $f2 \n\t"
915 "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
916 "paddh $f28, $f4, $f20 \n\t"
917 "paddh $f30, $f6, $f22 \n\t"
918 "paddh $f24, $f24, $f28 \n\t"
919 "paddh $f26, $f26, $f30 \n\t"
920 "paddh $f24, $f24, $f0 \n\t"
921 "paddh $f26, $f26, $f2 \n\t"
922 "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
923 "dli %[iAlpha], 0x2 \n\t"
924
925 "dmtc1 %[iAlpha], $f10 \n\t"
926 "paddh $f20, $f20, $f4 \n\t"
927 "paddh $f22, $f22, $f6 \n\t"
928 "psrah $f24, $f24, $f10 \n\t"
929 "psrah $f26, $f26, $f10 \n\t"
930 "and $f28, $f28, $f24 \n\t"
931 "and $f30, $f30, $f26 \n\t"
932
933 "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
934 "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
935 "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
936 "pandn $f28, $f28, $f24 \n\t"
937 "pandn $f30, $f30, $f26 \n\t"
938 "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
939 "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
940 "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
941 "paddh $f28, $f28, $f24 \n\t"
942 "paddh $f30, $f30, $f26 \n\t"
943 "paddh $f28, $f28, $f20 \n\t"
944 "paddh $f30, $f30, $f22 \n\t"
945 "paddh $f28, $f28, $f8 \n\t"
946 "paddh $f30, $f30, $f10 \n\t"
947 "dmtc1 %[iAlpha], $f10 \n\t"
948 "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
949 "psrah $f28, $f28, $f10 \n\t"
950 "psrah $f30, $f30, $f10 \n\t"
951 "and $f20, $f20, $f28 \n\t"
952 "and $f22, $f22, $f30 \n\t"
953 "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
954
955 "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
956 "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
957 "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
958 "paddh $f28, $f20, $f20 \n\t"
959 "paddh $f30, $f22, $f22 \n\t"
960 "paddh $f20, $f4, $f24 \n\t"
961 "paddh $f22, $f6, $f26 \n\t"
962 "paddh $f24, $f24, $f0 \n\t"
963 "paddh $f26, $f26, $f2 \n\t"
964 "paddh $f28, $f28, $f20 \n\t"
965 "paddh $f30, $f30, $f22 \n\t"
966 "paddh $f28, $f28, $f8 \n\t"
967 "paddh $f30, $f30, $f10 \n\t"
968 "dmtc1 %[iAlpha], $f10 \n\t"
969 "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
970 "psrah $f28, $f28, $f10 \n\t"
971 "psrah $f30, $f30, $f10 \n\t"
972 "dli %[iAlpha], 0x1 \n\t"
973 "pandn $f20, $f20, $f28 \n\t"
974 "pandn $f22, $f22, $f30 \n\t"
975 "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
976 "paddh $f28, $f28, $f4 \n\t"
977 "paddh $f30, $f30, $f6 \n\t"
978 "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
979 "paddh $f28, $f28, $f4 \n\t"
980 "paddh $f30, $f30, $f6 \n\t"
981 "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
982 "dmtc1 %[iAlpha], $f10 \n\t"
983 "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
984 "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
985 "psllh $f28, $f28, $f10 \n\t"
986 "psllh $f30, $f30, $f10 \n\t"
987 "dli %[iAlpha], 0x3 \n\t"
988 "paddh $f28, $f28, $f24 \n\t"
989 "paddh $f30, $f30, $f26 \n\t"
990 "paddh $f20, $f20, $f28 \n\t"
991 "paddh $f22, $f22, $f30 \n\t"
992 "dmtc1 %[iAlpha], $f10 \n\t"
993
994 "dli %[iAlpha], 0x2 \n\t"
995 "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
996 "psrah $f20, $f20, $f10 \n\t"
997 "psrah $f22, $f22, $f10 \n\t"
998 "and $f4, $f4, $f20 \n\t"
999 "and $f6, $f6, $f22 \n\t"
1000 "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
1001 "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
1002 "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
1003 "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
1004 "paddh $f24, $f4, $f4 \n\t"
1005 "paddh $f26, $f6, $f6 \n\t"
1006 "paddh $f4, $f4, $f8 \n\t"
1007 "paddh $f6, $f6, $f10 \n\t"
1008 "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
1009 "paddh $f28, $f28, $f20 \n\t"
1010 "paddh $f30, $f30, $f22 \n\t"
1011 "paddh $f4, $f4, $f8 \n\t"
1012 "paddh $f6, $f6, $f10 \n\t"
1013 "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
1014 "paddh $f24, $f24, $f28 \n\t"
1015 "paddh $f26, $f26, $f30 \n\t"
1016 "paddh $f20, $f20, $f8 \n\t"
1017 "paddh $f22, $f22, $f10 \n\t"
1018 "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
1019 "paddh $f24, $f24, $f8 \n\t"
1020 "dmtc1 %[iAlpha], $f8 \n\t"
1021 "paddh $f26, $f26, $f10 \n\t"
1022 "dli %[iAlpha], 0x1 \n\t"
1023 "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
1024 "dmtc1 %[iAlpha], $f10 \n\t"
1025 "psrah $f24, $f24, $f8 \n\t"
1026 "psrah $f26, $f26, $f8 \n\t"
1027 "psllh $f4, $f4, $f10 \n\t"
1028 "psllh $f6, $f6, $f10 \n\t"
1029 "paddh $f4, $f4, $f20 \n\t"
1030 "paddh $f6, $f6, $f22 \n\t"
1031 "dli %[iAlpha], 0x3 \n\t"
1032
1033 "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
1034 "pandn $f28, $f28, $f24 \n\t"
1035 "pandn $f30, $f30, $f26 \n\t"
1036 "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
1037 "dmtc1 %[iAlpha], $f10 \n\t"
1038 "paddh $f24, $f24, $f4 \n\t"
1039 "paddh $f26, $f26, $f6 \n\t"
1040 "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
1041 "psrah $f24, $f24, $f10 \n\t"
1042 "psrah $f26, $f26, $f10 \n\t"
1043 "and $f4, $f4, $f24 \n\t"
1044 "and $f6, $f6, $f26 \n\t"
1045
1046 "xor $f8, $f8, $f8 \n\t"
1047 "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
1048 "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
1049 "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
1050 "punpcklbh $f4, $f6, $f8 \n\t"
1051 "punpckhbh $f6, $f6, $f8 \n\t"
1052 "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
1053 "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
1054 "punpcklbh $f4, $f6, $f8 \n\t"
1055 "punpckhbh $f6, $f6, $f8 \n\t"
1056 "punpcklbh $f24, $f26, $f8 \n\t"
1057 "punpckhbh $f26, $f26, $f8 \n\t"
1058 "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
1059 "punpcklbh $f20, $f22, $f8 \n\t"
1060 "punpckhbh $f22, $f22, $f8 \n\t"
1061 "gslqc1 $f30, $f28, 0x0($14) \n\t"
1062 "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
1063 "gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
1064
1065 "gsldxc1 $f0, 0x8($15, $0) \n\t"
1066 "punpcklbh $f28, $f30, $f8 \n\t"
1067 "punpckhbh $f30, $f30, $f8 \n\t"
1068 "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
1069
1070 "punpcklbh $f28, $f0, $f8 \n\t"
1071 "punpckhbh $f30, $f0, $f8 \n\t"
1072 "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
1073 "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
1074
1075 "psubh $f28, $f24, $f4 \n\t"
1076 "psubh $f30, $f26, $f6 \n\t"
1077 "psubh $f24, $f24, $f8 \n\t"
1078 "psubh $f26, $f26, $f10 \n\t"
1079 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1080 "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
1081 "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
1082 "or $f16, $f16, $f8 \n\t"
1083 "or $f18, $f18, $f10 \n\t"
1084 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
1085 "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
1086 "psubh $f28, $f4, $f28 \n\t"
1087 "psubh $f30, $f6, $f30 \n\t"
1088
1089 "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
1090 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1091 "pcmpgth $f4, $f0, $f28 \n\t"
1092 "pcmpgth $f6, $f2, $f30 \n\t"
1093 "pcmpgth $f28, $f0, $f24 \n\t"
1094 "pcmpgth $f30, $f2, $f26 \n\t"
1095 "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
1096 "and $f4, $f4, $f28 \n\t"
1097 "and $f6, $f6, $f30 \n\t"
1098 "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
1099 "pcmpgth $f24, $f24, $f28 \n\t"
1100 "pcmpgth $f26, $f26, $f30 \n\t"
1101 "and $f4, $f4, $f24 \n\t"
1102 "and $f6, $f6, $f26 \n\t"
1103
1104 "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
1105 "pcmpgth $f24, $f24, $f28 \n\t"
1106 "pcmpgth $f26, $f26, $f30 \n\t"
1107 "xor $f8, $f8, $f8 \n\t"
1108 "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
1109 "punpcklbh $f12, $f14, $f8 \n\t"
1110 "punpckhbh $f14, $f14, $f8 \n\t"
1111 "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
1112 "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
1113 "psubh $f28, $f28, $f20 \n\t"
1114 "psubh $f30, $f30, $f22 \n\t"
1115 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1116 "pcmpgth $f24, $f24, $f28 \n\t"
1117 "pcmpgth $f26, $f26, $f30 \n\t"
1118
1119 "dli %[iAlpha], 0x1 \n\t"
1120 "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
1121 "and $f24, $f24, $f8 \n\t"
1122 "and $f26, $f26, $f10 \n\t"
1123 "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
1124 "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
1125 "psubh $f28, $f28, $f8 \n\t"
1126 "psubh $f30, $f30, $f10 \n\t"
1127 "dmtc1 %[iAlpha], $f10 \n\t"
1128
1129 "psllh $f12, $f12, $f10 \n\t"
1130 "psllh $f14, $f14, $f10 \n\t"
1131 "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
1132 "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
1133
1134 "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
1135 "paddh $f12, $f12, $f20 \n\t"
1136 "paddh $f14, $f14, $f22 \n\t"
1137 "paddh $f12, $f12, $f20 \n\t"
1138 "paddh $f14, $f14, $f22 \n\t"
1139 "paddh $f12, $f12, $f20 \n\t"
1140 "paddh $f14, $f14, $f22 \n\t"
1141 "paddh $f12, $f12, $f8 \n\t"
1142 "paddh $f14, $f14, $f10 \n\t"
1143 "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
1144 "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
1145 "paddh $f12, $f12, $f8 \n\t"
1146 "paddh $f14, $f14, $f10 \n\t"
1147 WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1148 "pcmpgth $f24, $f24, $f28 \n\t"
1149 "pcmpgth $f26, $f26, $f30 \n\t"
1150 "and $f24, $f24, $f0 \n\t"
1151 "and $f26, $f26, $f2 \n\t"
1152 "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
1153 "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
1154
1155 "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
1156 "dli %[iAlpha], 0x3 \n\t"
1157 "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
1158 "and $f24, $f0, $f16 \n\t"
1159 "and $f26, $f2, $f18 \n\t"
1160 "pandn $f16, $f0, $f28 \n\t"
1161 "pandn $f18, $f2, $f30 \n\t"
1162 "or $f24, $f24, $f16 \n\t"
1163 "or $f26, $f26, $f18 \n\t"
1164 "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
1165 "paddh $f12, $f12, $f16 \n\t"
1166 "paddh $f14, $f14, $f18 \n\t"
1167 "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
1168 "paddh $f12, $f12, $f28 \n\t"
1169 "paddh $f14, $f14, $f30 \n\t"
1170 "dmtc1 %[iAlpha], $f28 \n\t"
1171 "psrah $f12, $f12, $f28 \n\t"
1172 "psrah $f14, $f14, $f28 \n\t"
1173 "and $f12, $f12, $f8 \n\t"
1174 "and $f14, $f14, $f10 \n\t"
1175 "pandn $f8, $f8, $f20 \n\t"
1176 "pandn $f10, $f10, $f22 \n\t"
1177 "or $f12, $f12, $f8 \n\t"
1178 "or $f14, $f14, $f10 \n\t"
1179 "and $f28, $f4, $f12 \n\t"
1180 "and $f30, $f6, $f14 \n\t"
1181 "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
1182 "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
1183 "or $f12, $f12, $f8 \n\t"
1184 "or $f14, $f14, $f10 \n\t"
1185 "pandn $f8, $f4, $f20 \n\t"
1186 "pandn $f10, $f6, $f22 \n\t"
1187 "or $f28, $f28, $f8 \n\t"
1188 "or $f30, $f30, $f10 \n\t"
1189
1190 "dli %[iAlpha], 0x2 \n\t"
1191 "and $f8, $f0, $f12 \n\t"
1192 "and $f10, $f2, $f14 \n\t"
1193 "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
1194 "pandn $f12, $f0, $f12 \n\t"
1195 "pandn $f14, $f2, $f14 \n\t"
1196 "or $f8, $f8, $f12 \n\t"
1197 "or $f10, $f10, $f14 \n\t"
1198 "packushb $f24, $f24, $f26 \n\t"
1199 "packushb $f26, $f28, $f30 \n\t"
1200 "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
1201 "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
1202 "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
1203 "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
1204 "paddh $f8, $f20, $f8 \n\t"
1205 "paddh $f10, $f22, $f10 \n\t"
1206 "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
1207 "paddh $f28, $f28, $f16 \n\t"
1208 "paddh $f30, $f30, $f18 \n\t"
1209 "paddh $f8, $f8, $f28 \n\t"
1210 "paddh $f10, $f10, $f30 \n\t"
1211 "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
1212 "paddh $f8, $f8, $f28 \n\t"
1213 "paddh $f10, $f10, $f30 \n\t"
1214 "dmtc1 %[iAlpha], $f28 \n\t"
1215 "psrah $f8, $f8, $f28 \n\t"
1216 "psrah $f10, $f10, $f28 \n\t"
1217 "dli %[iAlpha], 0x1 \n\t"
1218 "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
1219 "and $f24, $f24, $f8 \n\t"
1220 "and $f26, $f26, $f10 \n\t"
1221 "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
1222 "pandn $f28, $f28, $f8 \n\t"
1223 "pandn $f30, $f30, $f10 \n\t"
1224 "or $f24, $f24, $f28 \n\t"
1225 "or $f26, $f26, $f30 \n\t"
1226 "and $f12, $f4, $f24 \n\t"
1227 "and $f14, $f6, $f26 \n\t"
1228 "pandn $f24, $f4, $f8 \n\t"
1229 "pandn $f26, $f6, $f10 \n\t"
1230 "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
1231 "paddh $f8, $f8, $f28 \n\t"
1232 "paddh $f10, $f10, $f30 \n\t"
1233 "paddh $f8, $f8, $f16 \n\t"
1234 "paddh $f10, $f10, $f18 \n\t"
1235 "or $f12, $f12, $f24 \n\t"
1236 "or $f14, $f14, $f26 \n\t"
1237 "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
1238 "dmtc1 %[iAlpha], $f28 \n\t"
1239 "packushb $f24, $f24, $f26 \n\t"
1240 "packushb $f26, $f12, $f14 \n\t"
1241 "psllh $f8, $f8, $f28 \n\t"
1242 "psllh $f10, $f10, $f28 \n\t"
1243 "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
1244 "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
1245 "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
1246 "or $f24, $f24, $f28 \n\t"
1247 "or $f26, $f26, $f30 \n\t"
1248 "dli %[iAlpha], 0x3 \n\t"
1249
1250 "and $f12, $f0, $f24 \n\t"
1251 "and $f14, $f2, $f26 \n\t"
1252 "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
1253 "pandn $f24, $f0, $f24 \n\t"
1254 "pandn $f26, $f2, $f26 \n\t"
1255 "or $f12, $f12, $f24 \n\t"
1256 "or $f14, $f14, $f26 \n\t"
1257 "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
1258 "gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
1259 "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
1260 "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
1261 "paddh $f12, $f12, $f28 \n\t"
1262 "paddh $f14, $f14, $f30 \n\t"
1263 "paddh $f8, $f8, $f12 \n\t"
1264 "paddh $f10, $f10, $f14 \n\t"
1265 "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
1266 "paddh $f20, $f20, $f8 \n\t"
1267 "paddh $f22, $f22, $f10 \n\t"
1268 "dmtc1 %[iAlpha], $f28 \n\t"
1269 "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
1270 "psrah $f20, $f20, $f28 \n\t"
1271 "psrah $f22, $f22, $f28 \n\t"
1272 "and $f24, $f24, $f20 \n\t"
1273 "and $f26, $f26, $f22 \n\t"
1274 "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
1275 "paddh $f8, $f8, $f20 \n\t"
1276 "paddh $f10, $f10, $f22 \n\t"
1277 "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
1278 "dli %[iAlpha], 0x2 \n\t"
1279 "paddh $f20, $f20, $f28 \n\t"
1280 "paddh $f22, $f22, $f30 \n\t"
1281 "paddh $f16, $f12, $f12 \n\t"
1282 "paddh $f18, $f14, $f14 \n\t"
1283 "paddh $f16, $f16, $f8 \n\t"
1284 "paddh $f18, $f18, $f10 \n\t"
1285 "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
1286 "paddh $f16, $f16, $f28 \n\t"
1287 "paddh $f18, $f18, $f30 \n\t"
1288 "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
1289 "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
1290 "paddh $f12, $f12, $f28 \n\t"
1291 "paddh $f14, $f14, $f30 \n\t"
1292 "dmtc1 %[iAlpha], $f28 \n\t"
1293 "psrah $f16, $f16, $f28 \n\t"
1294 "psrah $f18, $f18, $f28 \n\t"
1295 "pandn $f8, $f8, $f16 \n\t"
1296 "pandn $f10, $f10, $f18 \n\t"
1297 "or $f24, $f24, $f8 \n\t"
1298 "or $f26, $f26, $f10 \n\t"
1299 "and $f28, $f4, $f24 \n\t"
1300 "and $f30, $f6, $f26 \n\t"
1301 "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
1302 "pandn $f8, $f4, $f24 \n\t"
1303 "pandn $f10, $f6, $f26 \n\t"
1304 "or $f28, $f28, $f8 \n\t"
1305 "or $f30, $f30, $f10 \n\t"
1306 "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
1307 "packushb $f8, $f8, $f10 \n\t"
1308 "packushb $f10, $f28, $f30 \n\t"
1309 "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
1310 "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
1311 "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
1312 "or $f8, $f8, $f28 \n\t"
1313 "or $f10, $f10, $f30 \n\t"
1314 "dli %[iAlpha], 0x1 \n\t"
1315
1316 "and $f16, $f0, $f8 \n\t"
1317 "and $f18, $f2, $f10 \n\t"
1318 "paddh $f20, $f20, $f24 \n\t"
1319 "paddh $f22, $f22, $f26 \n\t"
1320 "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
1321 "pandn $f8, $f0, $f28 \n\t"
1322 "pandn $f10, $f2, $f30 \n\t"
1323 "or $f16, $f16, $f8 \n\t"
1324 "or $f18, $f18, $f10 \n\t"
1325 "dmtc1 %[iAlpha], $f28 \n\t"
1326 "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
1327 "dli %[iAlpha], 0x3 \n\t"
1328 "psllh $f20, $f20, $f28 \n\t"
1329 "psllh $f22, $f22, $f28 \n\t"
1330 "paddh $f20, $f20, $f12 \n\t"
1331 "paddh $f22, $f22, $f14 \n\t"
1332 "dmtc1 %[iAlpha], $f28 \n\t"
1333 "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
1334 "paddh $f8, $f8, $f20 \n\t"
1335 "paddh $f10, $f10, $f22 \n\t"
1336 "psrah $f8, $f8, $f28 \n\t"
1337 "psrah $f10, $f10, $f28 \n\t"
1338 "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
1339 "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
1340 "and $f16, $f16, $f8 \n\t"
1341 "and $f18, $f18, $f10 \n\t"
1342 "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
1343 "paddh $f20, $f8, $f8 \n\t"
1344 "paddh $f22, $f10, $f10 \n\t"
1345 "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
1346 "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
1347 "paddh $f8, $f8, $f28 \n\t"
1348 "paddh $f10, $f10, $f30 \n\t"
1349 "dli %[iAlpha], 0x2 \n\t"
1350 "paddh $f20, $f20, $f8 \n\t"
1351 "paddh $f22, $f22, $f10 \n\t"
1352 "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
1353 "paddh $f20, $f20, $f28 \n\t"
1354 "paddh $f22, $f22, $f30 \n\t"
1355 "dmtc1 %[iAlpha], $f28 \n\t"
1356 "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
1357 "psrah $f20, $f20, $f28 \n\t"
1358 "psrah $f22, $f22, $f28 \n\t"
1359 "pandn $f12, $f12, $f20 \n\t"
1360 "pandn $f14, $f14, $f22 \n\t"
1361 "or $f16, $f16, $f12 \n\t"
1362 "or $f18, $f18, $f14 \n\t"
1363 "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
1364 "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
1365 "or $f12, $f12, $f28 \n\t"
1366 "or $f14, $f14, $f30 \n\t"
1367 "and $f28, $f4, $f16 \n\t"
1368 "and $f30, $f6, $f18 \n\t"
1369 "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
1370 "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
1371 "pandn $f8, $f4, $f16 \n\t"
1372 "pandn $f10, $f6, $f18 \n\t"
1373 "or $f28, $f28, $f8 \n\t"
1374 "or $f30, $f30, $f10 \n\t"
1375 "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
1376 "paddh $f16, $f16, $f8 \n\t"
1377 "paddh $f18, $f18, $f10 \n\t"
1378 "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
1379 "packushb $f8, $f8, $f10 \n\t"
1380 "packushb $f10, $f28, $f30 \n\t"
1381 "dli %[iAlpha], 0x2 \n\t"
1382 "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
1383
1384 "and $f8, $f0, $f12 \n\t"
1385 "and $f10, $f2, $f14 \n\t"
1386 "gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
1387 "pandn $f12, $f0, $f28 \n\t"
1388 "pandn $f14, $f2, $f30 \n\t"
1389 "or $f8, $f8, $f12 \n\t"
1390 "or $f10, $f10, $f14 \n\t"
1391 "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
1392 "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
1393 "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
1394 "paddh $f12, $f8, $f28 \n\t"
1395 "paddh $f14, $f10, $f30 \n\t"
1396 "paddh $f12, $f12, $f16 \n\t"
1397 "paddh $f14, $f14, $f18 \n\t"
1398 "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
1399 "paddh $f12, $f12, $f28 \n\t"
1400 "paddh $f14, $f14, $f30 \n\t"
1401 "dmtc1 %[iAlpha], $f28 \n\t"
1402 "psrah $f12, $f12, $f28 \n\t"
1403 "psrah $f14, $f14, $f28 \n\t"
1404 "and $f24, $f24, $f12 \n\t"
1405 "and $f26, $f26, $f14 \n\t"
1406 "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
1407 "pandn $f16, $f12, $f20 \n\t"
1408 "pandn $f18, $f14, $f22 \n\t"
1409 "or $f24, $f24, $f16 \n\t"
1410 "or $f26, $f26, $f18 \n\t"
1411 "and $f28, $f4, $f24 \n\t"
1412 "and $f30, $f6, $f26 \n\t"
1413 "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
1414 "pandn $f16, $f4, $f20 \n\t"
1415 "pandn $f18, $f6, $f22 \n\t"
1416 "or $f28, $f28, $f16 \n\t"
1417 "or $f30, $f30, $f18 \n\t"
1418 "dli %[iAlpha], 0x1 \n\t"
1419
1420 "packushb $f24, $f24, $f26 \n\t"
1421 "packushb $f26, $f28, $f30 \n\t"
1422 "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
1423 "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
1424 "or $f28, $f28, $f16 \n\t"
1425 "or $f30, $f30, $f18 \n\t"
1426 "and $f16, $f0, $f28 \n\t"
1427 "and $f18, $f2, $f30 \n\t"
1428 "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
1429 "pandn $f0, $f0, $f28 \n\t"
1430 "pandn $f2, $f2, $f30 \n\t"
1431 "or $f16, $f16, $f0 \n\t"
1432 "or $f18, $f18, $f2 \n\t"
1433 "xor $f28, $f28, $f28 \n\t"
1434 "xor $f30, $f30, $f30 \n\t"
1435 "gslqc1 $f2, $f0, 0x0($12) \n\t"
1436 "dmtc1 %[iAlpha], $f28 \n\t"
1437 "punpcklbh $f0, $f2, $f30 \n\t"
1438 "punpckhbh $f2, $f2, $f30 \n\t"
1439 "psllh $f0, $f0, $f28 \n\t"
1440 "psllh $f2, $f2, $f28 \n\t"
1441 "paddh $f0, $f0, $f8 \n\t"
1442 "paddh $f2, $f2, $f10 \n\t"
1443 "paddh $f0, $f0, $f8 \n\t"
1444 "paddh $f2, $f2, $f10 \n\t"
1445 "paddh $f0, $f0, $f8 \n\t"
1446 "paddh $f2, $f2, $f10 \n\t"
1447 "paddh $f0, $f0, $f20 \n\t"
1448 "paddh $f2, $f2, $f22 \n\t"
1449 "dli %[iAlpha], 0x3 \n\t"
1450 "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
1451 "paddh $f0, $f0, $f28 \n\t"
1452 "paddh $f2, $f2, $f30 \n\t"
1453 "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
1454 "paddh $f0, $f0, $f28 \n\t"
1455 "paddh $f2, $f2, $f30 \n\t"
1456 "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
1457 "paddh $f0, $f0, $f28 \n\t"
1458 "paddh $f2, $f2, $f30 \n\t"
1459 "dmtc1 %[iAlpha], $f28 \n\t"
1460 "psrah $f0, $f0, $f28 \n\t"
1461 "psrah $f2, $f2, $f28 \n\t"
1462 "and $f0, $f0, $f12 \n\t"
1463 "and $f2, $f2, $f14 \n\t"
1464 "pandn $f12, $f12, $f8 \n\t"
1465 "pandn $f14, $f14, $f10 \n\t"
1466 "or $f0, $f0, $f12 \n\t"
1467 "or $f2, $f2, $f14 \n\t"
1468 "and $f28, $f4, $f0 \n\t"
1469 "and $f30, $f6, $f2 \n\t"
1470
1471 "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
1472 "gssqc1 $f2, $f0, 0x0($11) \n\t"
1473
1474 "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
1475
1476 "gssqc1 $f2, $f0, 0x0($8) \n\t"
1477 "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
1478 "gssqc1 $f2, $f0, 0x0($9) \n\t"
1479 "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
1480
1481 "pandn $f4, $f4, $f8 \n\t"
1482 "pandn $f6, $f6, $f10 \n\t"
1483 "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
1484 "or $f28, $f28, $f4 \n\t"
1485 "or $f30, $f30, $f6 \n\t"
1486 "packushb $f16, $f16, $f18 \n\t"
1487 "packushb $f18, $f28, $f30 \n\t"
1488 "gssqc1 $f26, $f24, 0x0($13) \n\t"
1489 "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
1490 : [pPix]"+&r"((unsigned char *)pPix)
1491 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
1492 [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
1493 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
1494 "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
1495 "$f22", "$f24", "$f26", "$f28", "$f30"
1496 );
1497 RECOVER_REG;
1498 }
1499
DeblockChromaLt4V_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)1500 void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1501 int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
1502 unsigned char tmp[256] __attribute__((aligned(32)));
1503 BACKUP_REG;
1504 __asm__ volatile (
1505 ".set arch=loongson3a \n\t"
1506 "lb $8, 0x2(%[pTC]) \n\t"
1507 "lb $9, 0x3(%[pTC]) \n\t"
1508 "move $11, $8 \n\t"
1509 "lb $8, 0x1(%[pTC]) \n\t"
1510 "lb %[pTC], 0x0(%[pTC]) \n\t"
1511 "move $12, %[pTC] \n\t"
1512 "and %[pTC], $9, 0xFFFF \n\t"
1513 "dmtc1 %[pTC], $f4 \n\t"
1514 "and %[pTC], $9, 0xFFFF \n\t"
1515 "dmtc1 %[pTC], $f8 \n\t"
1516 "move %[pTC], $11 \n\t"
1517 "and $9, %[pTC], 0xFFFF \n\t"
1518 "and %[pTC], %[pTC], 0xFFFF \n\t"
1519 "dmtc1 %[pTC], $f16 \n\t"
1520 "and %[pTC], $8, 0xFFFF \n\t"
1521 "dmtc1 %[pTC], $f20 \n\t"
1522 "dmtc1 $9, $f12 \n\t"
1523 "and %[pTC], $8, 0xFFFF \n\t"
1524 "dmtc1 %[pTC], $f24 \n\t"
1525 "move %[pTC], $12 \n\t"
1526 "and $9, %[pTC], 0xFFFF \n\t"
1527 "and %[pTC], %[pTC], 0xFFFF \n\t"
1528 "punpcklhw $f24, $f24, $f8 \n\t"
1529 "xor $f0, $f0, $f0 \n\t"
1530 "xor $f2, $f2, $f2 \n\t"
1531 "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
1532 "dmtc1 $9, $f28 \n\t"
1533 "dmtc1 %[pTC], $f0 \n\t"
1534 "daddu %[pTC], %[iStride], %[iStride] \n\t"
1535 "dsubu $9, %[pPixCb], %[pTC] \n\t"
1536 "punpcklhw $f20, $f20, $f4 \n\t"
1537 "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
1538 "punpcklhw $f0, $f0, $f16 \n\t"
1539 "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
1540 "punpcklhw $f28, $f28, $f12 \n\t"
1541 "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
1542 "punpcklhw $f0, $f0, $f24 \n\t"
1543 "gsldxc1 $f24, 0x0($9, $0) \n\t"
1544 "punpcklhw $f28, $f28, $f20 \n\t"
1545 "punpckhhw $f2, $f0, $f28 \n\t"
1546 "punpcklhw $f0, $f0, $f28 \n\t"
1547 "dsubu $9, %[pPixCr], %[pTC] \n\t"
1548 "psubh $f8, $f4, $f0 \n\t"
1549 "psubh $f10, $f6, $f2 \n\t"
1550 "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
1551 "gsldxc1 $f8, 0x0($9, $0) \n\t"
1552 "mov.d $f26, $f8 \n\t"
1553 "dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
1554 "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
1555 "dsubu $9, %[pPixCr], %[iStride] \n\t"
1556 "gsldxc1 $f8, 0x0($9, $0) \n\t"
1557 "mov.d $f30, $f8 \n\t"
1558 "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
1559 "mov.d $f14, $f8 \n\t"
1560 "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
1561 "mov.d $f10, $f16 \n\t"
1562 "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
1563 "dmtc1 %[iAlpha], $f8 \n\t"
1564 "punpcklhw $f16, $f8, $f8 \n\t"
1565 "dmtc1 %[iBeta], $f8 \n\t"
1566 "punpcklhw $f20, $f8, $f8 \n\t"
1567 "punpcklwd $f8, $f20, $f20 \n\t"
1568 "mov.d $f10, $f8 \n\t"
1569 "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
1570 "punpckhbh $f10, $f24, $f4 \n\t"
1571 "punpcklbh $f8, $f24, $f4 \n\t"
1572 "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
1573 "punpcklwd $f16, $f16, $f16 \n\t"
1574 "mov.d $f18, $f16 \n\t"
1575 "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
1576 "punpcklbh $f24, $f26, $f6 \n\t"
1577 "punpckhbh $f26, $f26, $f6 \n\t"
1578 "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
1579 "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
1580 "punpcklbh $f24, $f26, $f6 \n\t"
1581 "punpckhbh $f26, $f26, $f6 \n\t"
1582 "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
1583 "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
1584 "punpcklbh $f24, $f26, $f6 \n\t"
1585 "punpckhbh $f26, $f26, $f6 \n\t"
1586 "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
1587 "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
1588 "mov.d $f8, $f28 \n\t"
1589 "mov.d $f10, $f30 \n\t"
1590 "punpcklbh $f28, $f30, $f6 \n\t"
1591 "punpckhbh $f30, $f30, $f6 \n\t"
1592 "punpckhbh $f22, $f20, $f4 \n\t"
1593 "punpcklbh $f20, $f20, $f4 \n\t"
1594 "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
1595 "punpckhbh $f14, $f12, $f4 \n\t"
1596 "punpcklbh $f12, $f12, $f4 \n\t"
1597 "dli %[iBeta], 0x4 \n\t"
1598 "punpckhbh $f10, $f8, $f4 \n\t"
1599 "punpcklbh $f8, $f8, $f4 \n\t"
1600 "dmtc1 %[iBeta], $f24 \n\t"
1601 "punpcklhw $f28, $f24, $f24 \n\t"
1602 "punpcklwd $f24, $f28, $f28 \n\t"
1603 "mov.d $f26, $f24 \n\t"
1604 "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
1605 "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
1606 "psubh $f28, $f28, $f20 \n\t"
1607 "psubh $f30, $f30, $f22 \n\t"
1608 "pcmpgth $f24, $f0, $f4 \n\t"
1609 "pcmpgth $f26, $f2, $f6 \n\t"
1610 "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
1611 "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
1612 "psubh $f24, $f12, $f8 \n\t"
1613 "psubh $f26, $f14, $f10 \n\t"
1614 "dmfc1 %[iAlpha], $f12 \n\t"
1615 "dmfc1 %[iBeta], $f14 \n\t"
1616 "dli $10, 0x2 \n\t"
1617 "dmtc1 $10, $f12 \n\t"
1618 "dli $10, 0x3 \n\t"
1619 "dmtc1 $10, $f14 \n\t"
1620 "psllh $f24, $f24, $f12 \n\t"
1621 "psllh $f26, $f26, $f12 \n\t"
1622 "paddh $f24, $f24, $f28 \n\t"
1623 "paddh $f26, $f26, $f30 \n\t"
1624 "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
1625 "paddh $f24, $f24, $f28 \n\t"
1626 "paddh $f26, $f26, $f30 \n\t"
1627 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
1628 "psrah $f24, $f24, $f14 \n\t"
1629 "psrah $f26, $f26, $f14 \n\t"
1630 "dmtc1 %[iAlpha], $f12 \n\t"
1631 "dmtc1 %[iBeta], $f14 \n\t"
1632 "pmaxsh $f4, $f4, $f24 \n\t"
1633 "pmaxsh $f6, $f6, $f26 \n\t"
1634 "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
1635 "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
1636 "pminsh $f24, $f24, $f4 \n\t"
1637 "pminsh $f26, $f26, $f6 \n\t"
1638 "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
1639 "psubh $f4, $f8, $f12 \n\t"
1640 "psubh $f6, $f10, $f14 \n\t"
1641 WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
1642 "pcmpgth $f24, $f16, $f4 \n\t"
1643 "pcmpgth $f26, $f18, $f6 \n\t"
1644 "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
1645 "psubh $f4, $f4, $f8 \n\t"
1646 "psubh $f6, $f6, $f10 \n\t"
1647 "dmfc1 %[iAlpha], $f8 \n\t"
1648 "dmfc1 %[iBeta], $f10 \n\t"
1649 WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
1650 "pcmpgth $f28, $f28, $f4 \n\t"
1651 "pcmpgth $f30, $f30, $f6 \n\t"
1652 "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
1653 "and $f24, $f24, $f28 \n\t"
1654 "and $f26, $f26, $f30 \n\t"
1655 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
1656 "psubh $f20, $f20, $f12 \n\t"
1657 "psubh $f22, $f22, $f14 \n\t"
1658 WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
1659 "pcmpgth $f4, $f4, $f20 \n\t"
1660 "pcmpgth $f6, $f6, $f22 \n\t"
1661 "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
1662 "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
1663 "psubh $f20, $f20, $f8 \n\t"
1664 "psubh $f22, $f22, $f10 \n\t"
1665 "and $f24, $f24, $f4 \n\t"
1666 "and $f26, $f26, $f6 \n\t"
1667 "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
1668 "and $f24, $f24, $f8 \n\t"
1669 "and $f26, $f26, $f10 \n\t"
1670 "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
1671 "and $f4, $f4, $f24 \n\t"
1672 "and $f6, $f6, $f26 \n\t"
1673 "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
1674 "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
1675 "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
1676 "psubh $f24, $f24, $f4 \n\t"
1677 "psubh $f26, $f26, $f6 \n\t"
1678 "dli $10, 0x2 \n\t"
1679 "dmtc1 $10, $f8 \n\t"
1680 "psllh $f24, $f24, $f8 \n\t"
1681 "psllh $f26, $f26, $f8 \n\t"
1682 "paddh $f24, $f24, $f20 \n\t"
1683 "paddh $f26, $f26, $f22 \n\t"
1684 "dli $10, 0x3 \n\t"
1685 "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
1686 "paddh $f24, $f24, $f8 \n\t"
1687 "paddh $f26, $f26, $f10 \n\t"
1688 "dmtc1 $10, $f8 \n\t"
1689 "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
1690 "psrah $f24, $f24, $f8 \n\t"
1691 "psrah $f26, $f26, $f8 \n\t"
1692 "pmaxsh $f20, $f20, $f24 \n\t"
1693 "pmaxsh $f22, $f22, $f26 \n\t"
1694 "pminsh $f0, $f0, $f20 \n\t"
1695 "pminsh $f2, $f2, $f22 \n\t"
1696 "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
1697 "psubh $f24, $f4, $f20 \n\t"
1698 "psubh $f26, $f6, $f22 \n\t"
1699 WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1700 "pcmpgth $f16, $f16, $f24 \n\t"
1701 "pcmpgth $f18, $f18, $f26 \n\t"
1702 "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
1703 "psubh $f24, $f24, $f4 \n\t"
1704 "psubh $f26, $f26, $f6 \n\t"
1705 WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1706 "pcmpgth $f28, $f28, $f24 \n\t"
1707 "pcmpgth $f30, $f30, $f26 \n\t"
1708 "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
1709 "and $f16, $f16, $f28 \n\t"
1710 "and $f18, $f18, $f30 \n\t"
1711 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
1712 "psubh $f24, $f24, $f20 \n\t"
1713 "psubh $f26, $f26, $f22 \n\t"
1714 WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1715 "dmtc1 %[iAlpha], $f8 \n\t"
1716 "dmtc1 %[iBeta], $f10 \n\t"
1717 "pcmpgth $f28, $f28, $f24 \n\t"
1718 "pcmpgth $f30, $f30, $f26 \n\t"
1719 "and $f16, $f16, $f28 \n\t"
1720 "and $f18, $f18, $f30 \n\t"
1721 "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
1722 "and $f16, $f16, $f24 \n\t"
1723 "and $f18, $f18, $f26 \n\t"
1724 "and $f0, $f0, $f16 \n\t"
1725 "and $f2, $f2, $f18 \n\t"
1726 "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
1727 "paddh $f8, $f8, $f16 \n\t"
1728 "paddh $f10, $f10, $f18 \n\t"
1729 "paddh $f4, $f4, $f0 \n\t"
1730 "paddh $f6, $f6, $f2 \n\t"
1731 "packushb $f8, $f8, $f10 \n\t"
1732 "packushb $f10, $f4, $f6 \n\t"
1733 "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
1734 "psubh $f12, $f12, $f16 \n\t"
1735 "psubh $f14, $f14, $f18 \n\t"
1736 "psubh $f20, $f20, $f0 \n\t"
1737 "psubh $f22, $f22, $f2 \n\t"
1738 "packushb $f12, $f12, $f14 \n\t"
1739 "packushb $f14, $f20, $f22 \n\t"
1740 "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
1741 "gssdxc1 $f10, 0x0($9, $0) \n\t"
1742 "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
1743 : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
1744 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
1745 [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
1746 : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
1747 "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1748 "$f28", "$f30"
1749 );
1750 RECOVER_REG;
1751 }
1752
DeblockChromaEq4V_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)1753 void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1754 int32_t iAlpha, int32_t iBeta) {
1755 unsigned char tmp[128] __attribute__((aligned(32)));
1756 BACKUP_REG;
1757 __asm__ volatile (
1758 ".set arch=loongson3a \n\t"
1759 "daddu $8, %[iStride], %[iStride] \n\t"
1760 "dsubu $9, %[pPixCb], $8 \n\t"
1761 "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
1762 "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
1763 "gsldxc1 $f4, 0x0($9, $0) \n\t"
1764 "dsubu $9, %[pPixCr], $8 \n\t"
1765 "gsldxc1 $f8, 0x0($9, $0) \n\t"
1766 "mov.d $f6, $f8 \n\t"
1767 "dsubu $8, %[pPixCb], %[iStride] \n\t"
1768 "gsldxc1 $f8, 0x0($8, $0) \n\t"
1769 "dsubu $9, %[pPixCr], %[iStride] \n\t"
1770 "gsldxc1 $f12, 0x0($9, $0) \n\t"
1771 "mov.d $f10, $f12 \n\t"
1772 "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
1773 "mov.d $f14, $f16 \n\t"
1774 "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
1775 "mov.d $f18, $f20 \n\t"
1776 "dmtc1 %[iAlpha], $f20 \n\t"
1777 "xor $f0, $f0, $f0 \n\t"
1778 "xor $f2, $f2, $f2 \n\t"
1779 "punpcklhw $f24, $f20, $f20 \n\t"
1780 "punpcklwd $f20, $f24, $f24 \n\t"
1781 "mov.d $f22, $f20 \n\t"
1782 "dmtc1 %[iBeta], $f24 \n\t"
1783 "punpcklhw $f28, $f24, $f24 \n\t"
1784 "punpcklwd $f24, $f28, $f28 \n\t"
1785 "mov.d $f26, $f24 \n\t"
1786 "mov.d $f28, $f4 \n\t"
1787 "punpcklbh $f4, $f6, $f2 \n\t"
1788 "punpckhbh $f6, $f6, $f2 \n\t"
1789 "punpckhbh $f30, $f28, $f0 \n\t"
1790 "punpcklbh $f28, $f28, $f0 \n\t"
1791 "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
1792 "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
1793 "punpckhbh $f30, $f8, $f0 \n\t"
1794 "punpcklbh $f28, $f8, $f0 \n\t"
1795 "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
1796 "punpckhbh $f30, $f12, $f0 \n\t"
1797 "punpcklbh $f28, $f12, $f0 \n\t"
1798 "punpcklbh $f12, $f14, $f2 \n\t"
1799 "punpckhbh $f14, $f14, $f2 \n\t"
1800 "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
1801 "mov.d $f28, $f16 \n\t"
1802 "punpcklbh $f16, $f18, $f2 \n\t"
1803 "punpckhbh $f18, $f18, $f2 \n\t"
1804 "punpcklbh $f8, $f10, $f2 \n\t"
1805 "punpckhbh $f10, $f10, $f2 \n\t"
1806 "punpckhbh $f30, $f28, $f0 \n\t"
1807 "punpcklbh $f28, $f28, $f0 \n\t"
1808 "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
1809 "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
1810 "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
1811 "psubh $f4, $f12, $f0 \n\t"
1812 "psubh $f6, $f14, $f2 \n\t"
1813 WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
1814 "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
1815 "pcmpgth $f0, $f20, $f4 \n\t"
1816 "pcmpgth $f2, $f22, $f6 \n\t"
1817 "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
1818 "psubh $f4, $f4, $f12 \n\t"
1819 "psubh $f6, $f6, $f14 \n\t"
1820 WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1821 "pcmpgth $f16, $f24, $f4 \n\t"
1822 "pcmpgth $f18, $f26, $f6 \n\t"
1823 "and $f0, $f0, $f16 \n\t"
1824 "and $f2, $f2, $f18 \n\t"
1825 "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
1826 "psubh $f4, $f28, $f16 \n\t"
1827 "psubh $f6, $f30, $f18 \n\t"
1828 WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1829 "pcmpgth $f16, $f24, $f4 \n\t"
1830 "pcmpgth $f18, $f26, $f6 \n\t"
1831 "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
1832 "psubh $f4, $f8, $f4 \n\t"
1833 "psubh $f6, $f10, $f6 \n\t"
1834 "dmfc1 %[iAlpha], $f28 \n\t"
1835 "dmfc1 %[iBeta], $f30 \n\t"
1836 WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
1837 "pcmpgth $f20, $f20, $f4 \n\t"
1838 "pcmpgth $f22, $f22, $f6 \n\t"
1839 "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
1840 "and $f0, $f0, $f16 \n\t"
1841 "and $f2, $f2, $f18 \n\t"
1842 "psubh $f4, $f4, $f8 \n\t"
1843 "psubh $f6, $f6, $f10 \n\t"
1844 WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1845 "pcmpgth $f16, $f24, $f4 \n\t"
1846 "pcmpgth $f18, $f26, $f6 \n\t"
1847 "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
1848 "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
1849 "psubh $f4, $f4, $f28 \n\t"
1850 "psubh $f6, $f6, $f30 \n\t"
1851 "and $f20, $f20, $f16 \n\t"
1852 "and $f22, $f22, $f18 \n\t"
1853 WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
1854 "dmtc1 %[iAlpha], $f28 \n\t"
1855 "dmtc1 %[iBeta], $f30 \n\t"
1856 "pcmpgth $f24, $f24, $f4 \n\t"
1857 "pcmpgth $f26, $f26, $f6 \n\t"
1858 "and $f20, $f20, $f24 \n\t"
1859 "and $f22, $f22, $f26 \n\t"
1860 "dli %[iBeta], 0x2 \n\t"
1861 "dmtc1 %[iBeta], $f4 \n\t"
1862 "punpcklhw $f16, $f4, $f4 \n\t"
1863 "punpcklwd $f4, $f16, $f16 \n\t"
1864 "mov.d $f6, $f4 \n\t"
1865 "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
1866 "paddh $f24, $f16, $f16 \n\t"
1867 "paddh $f26, $f18, $f18 \n\t"
1868 "paddh $f24, $f24, $f12 \n\t"
1869 "paddh $f26, $f26, $f14 \n\t"
1870 "paddh $f24, $f24, $f28 \n\t"
1871 "paddh $f26, $f26, $f30 \n\t"
1872 "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
1873 "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
1874 "paddh $f24, $f24, $f16 \n\t"
1875 "paddh $f26, $f26, $f18 \n\t"
1876 "dmtc1 %[iBeta], $f16 \n\t"
1877 "psrah $f24, $f24, $f16 \n\t"
1878 "psrah $f26, $f26, $f16 \n\t"
1879 "pandn $f16, $f0, $f12 \n\t"
1880 "pandn $f18, $f2, $f14 \n\t"
1881 "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
1882 "and $f4, $f0, $f24 \n\t"
1883 "and $f6, $f2, $f26 \n\t"
1884 "or $f4, $f4, $f16 \n\t"
1885 "or $f6, $f6, $f18 \n\t"
1886 "paddh $f24, $f12, $f12 \n\t"
1887 "paddh $f26, $f14, $f14 \n\t"
1888 "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
1889 "paddh $f24, $f24, $f8 \n\t"
1890 "paddh $f26, $f26, $f10 \n\t"
1891 "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
1892 "paddh $f24, $f24, $f16 \n\t"
1893 "paddh $f26, $f26, $f18 \n\t"
1894 "dmtc1 %[iBeta], $f16 \n\t"
1895 "paddh $f24, $f24, $f12 \n\t"
1896 "paddh $f26, $f26, $f14 \n\t"
1897 "psrah $f24, $f24, $f16 \n\t"
1898 "psrah $f26, $f26, $f16 \n\t"
1899 "and $f16, $f20, $f24 \n\t"
1900 "and $f18, $f22, $f26 \n\t"
1901 "pandn $f24, $f20, $f8 \n\t"
1902 "pandn $f26, $f22, $f10 \n\t"
1903 "or $f16, $f16, $f24 \n\t"
1904 "or $f18, $f18, $f26 \n\t"
1905 "packushb $f4, $f4, $f6 \n\t"
1906 "packushb $f6, $f16, $f18 \n\t"
1907 "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
1908 "paddh $f24, $f28, $f28 \n\t"
1909 "paddh $f26, $f30, $f30 \n\t"
1910 "paddh $f24, $f24, $f16 \n\t"
1911 "paddh $f26, $f26, $f18 \n\t"
1912 "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
1913 "paddh $f24, $f24, $f8 \n\t"
1914 "paddh $f26, $f26, $f10 \n\t"
1915 "dmtc1 %[iBeta], $f28 \n\t"
1916 "paddh $f24, $f24, $f12 \n\t"
1917 "paddh $f26, $f26, $f14 \n\t"
1918 "psrah $f24, $f24, $f28 \n\t"
1919 "psrah $f26, $f26, $f28 \n\t"
1920 "and $f8, $f0, $f24 \n\t"
1921 "and $f10, $f2, $f26 \n\t"
1922 "pandn $f0, $f0, $f16 \n\t"
1923 "pandn $f2, $f2, $f18 \n\t"
1924 "or $f8, $f8, $f0 \n\t"
1925 "or $f10, $f10, $f2 \n\t"
1926 "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
1927 "paddh $f24, $f0, $f0 \n\t"
1928 "paddh $f26, $f2, $f2 \n\t"
1929 "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
1930 "paddh $f24, $f24, $f0 \n\t"
1931 "paddh $f26, $f26, $f2 \n\t"
1932 "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
1933 "paddh $f24, $f24, $f16 \n\t"
1934 "paddh $f26, $f26, $f18 \n\t"
1935 "paddh $f24, $f24, $f12 \n\t"
1936 "paddh $f26, $f26, $f14 \n\t"
1937 "gssdxc1 $f4, 0x0($8, $0) \n\t"
1938 "psrah $f24, $f24, $f28 \n\t"
1939 "psrah $f26, $f26, $f28 \n\t"
1940 "and $f16, $f20, $f24 \n\t"
1941 "and $f18, $f22, $f26 \n\t"
1942 "pandn $f20, $f20, $f0 \n\t"
1943 "pandn $f22, $f22, $f2 \n\t"
1944 "or $f16, $f16, $f20 \n\t"
1945 "or $f18, $f18, $f22 \n\t"
1946 "packushb $f8, $f8, $f10 \n\t"
1947 "packushb $f10, $f16, $f18 \n\t"
1948 "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
1949 "gssdxc1 $f6, 0x0($9, $0) \n\t"
1950 "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
1951 : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
1952 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
1953 [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
1954 : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
1955 "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1956 "$f28", "$f30"
1957 );
1958 RECOVER_REG;
1959 }
1960
DeblockChromaEq4H_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)1961 void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1962 int32_t iAlpha, int32_t iBeta) {
1963 unsigned char tmp[256] __attribute__((aligned(32)));
1964 BACKUP_REG;
1965 __asm__ volatile (
1966 ".set arch=loongson3a \n\t"
1967 "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
1968 "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
1969 "move $9, %[pPixCb] \n\t"
1970 "move $10, %[pPixCr] \n\t"
1971 "dsll $11, %[iStride], 0x2 \n\t"
1972 "daddu %[pPixCb], %[pPixCb], $11 \n\t"
1973 "daddu %[pPixCr], %[pPixCr], $11 \n\t"
1974 "daddiu $11, %[tmp], 0x80 \n\t"
1975 "gsldlc1 $f0, 0x7($9) \n\t"
1976 "gsldrc1 $f0, 0x0($9) \n\t"
1977 "daddu $12, $9, %[iStride] \n\t"
1978 "gsldlc1 $f4, 0x7($12) \n\t"
1979 "gsldrc1 $f4, 0x0($12) \n\t"
1980 "daddu $12, $12, %[iStride] \n\t"
1981 "gsldlc1 $f8, 0x7($12) \n\t"
1982 "gsldrc1 $f8, 0x0($12) \n\t"
1983 "daddu $12, $12, %[iStride] \n\t"
1984 "gsldlc1 $f12, 0x7($12) \n\t"
1985 "gsldlc1 $f16, 0x7($10) \n\t"
1986 "gsldrc1 $f12, 0x0($12) \n\t"
1987 "gsldrc1 $f16, 0x0($10) \n\t"
1988 "daddu $12, $10, %[iStride] \n\t"
1989 "gsldlc1 $f20, 0x7($12) \n\t"
1990 "gsldrc1 $f20, 0x0($12) \n\t"
1991 "daddu $12, $12, %[iStride] \n\t"
1992 "gsldlc1 $f24, 0x7($12) \n\t"
1993 "gsldrc1 $f24, 0x0($12) \n\t"
1994 "daddu $12, $12, %[iStride] \n\t"
1995 "gsldlc1 $f28, 0x7($12) \n\t"
1996 "gsldrc1 $f28, 0x0($12) \n\t"
1997 "punpcklwd $f0, $f0, $f16 \n\t"
1998 "punpcklwd $f4, $f4, $f20 \n\t"
1999 "punpcklwd $f8, $f8, $f24 \n\t"
2000 "punpcklwd $f12, $f12, $f28 \n\t"
2001 "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
2002 "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
2003 "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
2004 "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
2005 "punpcklwd $f16, $f16, $f20 \n\t"
2006 "mov.d $f2, $f16 \n\t"
2007 "daddu $12, %[pPixCb], %[iStride] \n\t"
2008 "daddu $13, %[pPixCr], %[iStride] \n\t"
2009 "gsldlc1 $f16, 0x7($12) \n\t"
2010 "gsldlc1 $f20, 0x7($13) \n\t"
2011 "gsldrc1 $f16, 0x0($12) \n\t"
2012 "gsldrc1 $f20, 0x0($13) \n\t"
2013 "punpcklwd $f16, $f16, $f20 \n\t"
2014 "mov.d $f6, $f16 \n\t"
2015 "daddu $12, $12, %[iStride] \n\t"
2016 "daddu $13, $13, %[iStride] \n\t"
2017 "gsldlc1 $f16, 0x7($12) \n\t"
2018 "gsldlc1 $f20, 0x7($13) \n\t"
2019 "gsldrc1 $f16, 0x0($12) \n\t"
2020 "gsldrc1 $f20, 0x0($13) \n\t"
2021 "punpcklwd $f16, $f16, $f20 \n\t"
2022 "mov.d $f10, $f16 \n\t"
2023 "daddu $12, $12, %[iStride] \n\t"
2024 "daddu $13, $13, %[iStride] \n\t"
2025 "gsldlc1 $f16, 0x7($12) \n\t"
2026 "gsldlc1 $f20, 0x7($13) \n\t"
2027 "gsldrc1 $f16, 0x0($12) \n\t"
2028 "gsldrc1 $f20, 0x0($13) \n\t"
2029 "punpcklwd $f16, $f16, $f20 \n\t"
2030 "mov.d $f14, $f16 \n\t"
2031 "punpcklbh $f24, $f2, $f6 \n\t"
2032 "punpckhbh $f26, $f2, $f6 \n\t"
2033 "punpckhbh $f2, $f0, $f4 \n\t"
2034 "punpcklbh $f0, $f0, $f4 \n\t"
2035 "punpcklbh $f28, $f10, $f14 \n\t"
2036 "punpckhbh $f30, $f10, $f14 \n\t"
2037 "punpckhbh $f10, $f8, $f12 \n\t"
2038 "punpcklbh $f8, $f8, $f12 \n\t"
2039 "punpcklhw $f16, $f2, $f10 \n\t"
2040 "punpckhhw $f18, $f2, $f10 \n\t"
2041 "punpckhhw $f2, $f0, $f8 \n\t"
2042 "punpcklhw $f0, $f0, $f8 \n\t"
2043 "punpcklhw $f20, $f26, $f30 \n\t"
2044 "punpckhhw $f22, $f26, $f30 \n\t"
2045 "punpckhhw $f26, $f24, $f28 \n\t"
2046 "punpcklhw $f24, $f24, $f28 \n\t"
2047 "punpcklwd $f4, $f2, $f26 \n\t"
2048 "punpckhwd $f6, $f2, $f26 \n\t"
2049 "punpckhwd $f2, $f0, $f24 \n\t"
2050 "punpcklwd $f0, $f0, $f24 \n\t"
2051 "punpcklwd $f8, $f18, $f22 \n\t"
2052 "punpckhwd $f10, $f18, $f22 \n\t"
2053 "punpckhwd $f18, $f16, $f20 \n\t"
2054 "punpcklwd $f16, $f16, $f20 \n\t"
2055 "mov.d $f20, $f2 \n\t"
2056 "mov.d $f22, $f18 \n\t"
2057 "mov.d $f2, $f16 \n\t"
2058 "mov.d $f24, $f6 \n\t"
2059 "mov.d $f26, $f10 \n\t"
2060 "mov.d $f6, $f8 \n\t"
2061 "gssqc1 $f2, $f0, 0x0($11) \n\t"
2062 "gssqc1 $f22, $f20, 0x10($11) \n\t"
2063 "gssqc1 $f6, $f4, 0x20($11) \n\t"
2064 "gssqc1 $f26, $f24, 0x30($11) \n\t"
2065 "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
2066 "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
2067 "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
2068 "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
2069 "xor $f0, $f0, $f0 \n\t"
2070 "dmtc1 %[iAlpha], $f4 \n\t"
2071 "punpcklhw $f8, $f4, $f4 \n\t"
2072 "punpcklwd $f4, $f8, $f8 \n\t"
2073 "mov.d $f6, $f4 \n\t"
2074 "dmtc1 %[iBeta], $f8 \n\t"
2075 "punpcklhw $f12, $f8, $f8 \n\t"
2076 "punpcklwd $f8, $f12, $f12 \n\t"
2077 "mov.d $f10, $f8 \n\t"
2078 "mov.d $f12, $f24 \n\t"
2079 "punpcklbh $f24, $f26, $f0 \n\t"
2080 "punpckhbh $f26, $f26, $f0 \n\t"
2081 "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
2082 "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
2083 "punpcklbh $f24, $f26, $f0 \n\t"
2084 "punpckhbh $f26, $f26, $f0 \n\t"
2085 "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
2086 "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
2087 "punpcklbh $f24, $f26, $f0 \n\t"
2088 "punpckhbh $f26, $f26, $f0 \n\t"
2089 "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
2090 "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
2091 "punpcklbh $f24, $f26, $f0 \n\t"
2092 "punpckhbh $f26, $f26, $f0 \n\t"
2093 "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
2094 "punpckhbh $f30, $f28, $f0 \n\t"
2095 "punpcklbh $f28, $f28, $f0 \n\t"
2096 "punpckhbh $f18, $f16, $f0 \n\t"
2097 "punpcklbh $f16, $f16, $f0 \n\t"
2098 "punpckhbh $f22, $f20, $f0 \n\t"
2099 "punpcklbh $f20, $f20, $f0 \n\t"
2100 "punpckhbh $f14, $f12, $f0 \n\t"
2101 "punpcklbh $f12, $f12, $f0 \n\t"
2102 "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
2103 "psubh $f24, $f16, $f20 \n\t"
2104 "psubh $f26, $f18, $f22 \n\t"
2105 WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2106 "pcmpgth $f0, $f4, $f24 \n\t"
2107 "pcmpgth $f2, $f6, $f26 \n\t"
2108 "psubh $f24, $f12, $f16 \n\t"
2109 "psubh $f26, $f14, $f18 \n\t"
2110 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
2111 "pcmpgth $f28, $f8, $f24 \n\t"
2112 "pcmpgth $f30, $f10, $f26 \n\t"
2113 "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
2114 "psubh $f24, $f24, $f20 \n\t"
2115 "psubh $f26, $f26, $f22 \n\t"
2116 "and $f0, $f0, $f28 \n\t"
2117 "and $f2, $f2, $f30 \n\t"
2118 WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
2119 "dmfc1 %[iAlpha], $f20 \n\t"
2120 "dmfc1 %[iBeta], $f22 \n\t"
2121 "pcmpgth $f28, $f8, $f24 \n\t"
2122 "pcmpgth $f30, $f10, $f26 \n\t"
2123 "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
2124 "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
2125 "psubh $f24, $f24, $f20 \n\t"
2126 "psubh $f26, $f26, $f22 \n\t"
2127 WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2128 "pcmpgth $f4, $f4, $f24 \n\t"
2129 "pcmpgth $f6, $f6, $f26 \n\t"
2130 "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
2131 "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
2132 "psubh $f24, $f24, $f20 \n\t"
2133 "psubh $f26, $f26, $f22 \n\t"
2134 WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2135 "and $f0, $f0, $f28 \n\t"
2136 "and $f2, $f2, $f30 \n\t"
2137 "pcmpgth $f28, $f8, $f24 \n\t"
2138 "pcmpgth $f30, $f10, $f26 \n\t"
2139 "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
2140 "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
2141 "psubh $f24, $f24, $f20 \n\t"
2142 "psubh $f26, $f26, $f22 \n\t"
2143 WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2144 "dli $8, 0x2 \n\t"
2145 "and $f4, $f4, $f28 \n\t"
2146 "and $f6, $f6, $f30 \n\t"
2147 "pcmpgth $f8, $f8, $f24 \n\t"
2148 "pcmpgth $f10, $f10, $f26 \n\t"
2149 "and $f4, $f4, $f8 \n\t"
2150 "and $f6, $f6, $f10 \n\t"
2151 "dmtc1 $8, $f8 \n\t"
2152 "punpcklhw $f24, $f8, $f8 \n\t"
2153 "punpcklwd $f8, $f24, $f24 \n\t"
2154 "mov.d $f10, $f8 \n\t"
2155 "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
2156 "paddh $f8, $f12, $f12 \n\t"
2157 "paddh $f10, $f14, $f14 \n\t"
2158 "paddh $f8, $f8, $f16 \n\t"
2159 "paddh $f10, $f10, $f18 \n\t"
2160 "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
2161 "paddh $f8, $f8, $f20 \n\t"
2162 "paddh $f10, $f10, $f22 \n\t"
2163 "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
2164 "paddh $f8, $f8, $f24 \n\t"
2165 "paddh $f10, $f10, $f26 \n\t"
2166 "dmtc1 $8, $f20 \n\t"
2167 "psrah $f8, $f8, $f20 \n\t"
2168 "psrah $f10, $f10, $f20 \n\t"
2169 "and $f24, $f0, $f8 \n\t"
2170 "and $f26, $f2, $f10 \n\t"
2171 "pandn $f8, $f0, $f16 \n\t"
2172 "pandn $f10, $f2, $f18 \n\t"
2173 "or $f24, $f24, $f8 \n\t"
2174 "or $f26, $f26, $f10 \n\t"
2175 "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
2176 "paddh $f28, $f8, $f8 \n\t"
2177 "paddh $f30, $f10, $f10 \n\t"
2178 "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
2179 "paddh $f28, $f28, $f20 \n\t"
2180 "paddh $f30, $f30, $f22 \n\t"
2181 "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
2182 "paddh $f28, $f28, $f16 \n\t"
2183 "paddh $f30, $f30, $f18 \n\t"
2184 "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
2185 "paddh $f28, $f28, $f8 \n\t"
2186 "paddh $f30, $f30, $f10 \n\t"
2187 "pandn $f8, $f4, $f20 \n\t"
2188 "pandn $f10, $f6, $f22 \n\t"
2189 "dmtc1 $8, $f20 \n\t"
2190 "psrah $f28, $f28, $f20 \n\t"
2191 "psrah $f30, $f30, $f20 \n\t"
2192 "and $f16, $f4, $f28 \n\t"
2193 "and $f18, $f6, $f30 \n\t"
2194 "or $f16, $f16, $f8 \n\t"
2195 "or $f18, $f18, $f10 \n\t"
2196 "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
2197 "packushb $f24, $f24, $f26 \n\t"
2198 "packushb $f26, $f16, $f18 \n\t"
2199 "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
2200 "paddh $f24, $f8, $f8 \n\t"
2201 "paddh $f26, $f10, $f10 \n\t"
2202 "dmtc1 %[iAlpha], $f20 \n\t"
2203 "dmtc1 %[iBeta], $f22 \n\t"
2204 "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
2205 "paddh $f24, $f24, $f20 \n\t"
2206 "paddh $f26, $f26, $f22 \n\t"
2207 "paddh $f24, $f24, $f12 \n\t"
2208 "paddh $f26, $f26, $f14 \n\t"
2209 "mov.d $f16, $f0 \n\t"
2210 "mov.d $f18, $f2 \n\t"
2211 "pandn $f0, $f0, $f20 \n\t"
2212 "pandn $f2, $f2, $f22 \n\t"
2213 "dmtc1 $8, $f20 \n\t"
2214 "paddh $f24, $f24, $f8 \n\t"
2215 "paddh $f26, $f26, $f10 \n\t"
2216 "psrah $f24, $f24, $f20 \n\t"
2217 "psrah $f26, $f26, $f20 \n\t"
2218 "and $f16, $f16, $f24 \n\t"
2219 "and $f18, $f18, $f26 \n\t"
2220 "or $f16, $f16, $f0 \n\t"
2221 "or $f18, $f18, $f2 \n\t"
2222 "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
2223 "paddh $f20, $f0, $f0 \n\t"
2224 "paddh $f22, $f2, $f2 \n\t"
2225 "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
2226 "paddh $f20, $f20, $f0 \n\t"
2227 "paddh $f22, $f22, $f2 \n\t"
2228 "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
2229 "paddh $f20, $f20, $f12 \n\t"
2230 "paddh $f22, $f22, $f14 \n\t"
2231 "paddh $f20, $f20, $f8 \n\t"
2232 "paddh $f22, $f22, $f10 \n\t"
2233 "dmtc1 $8, $f8 \n\t"
2234 "psrah $f20, $f20, $f8 \n\t"
2235 "psrah $f22, $f22, $f8 \n\t"
2236 "and $f12, $f4, $f20 \n\t"
2237 "and $f14, $f6, $f22 \n\t"
2238 "pandn $f4, $f4, $f0 \n\t"
2239 "pandn $f6, $f6, $f2 \n\t"
2240 "or $f12, $f12, $f4 \n\t"
2241 "or $f14, $f14, $f6 \n\t"
2242 "packushb $f16, $f16, $f18 \n\t"
2243 "packushb $f18, $f12, $f14 \n\t"
2244 "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
2245 "gslqc1 $f2, $f0, 0x0($11) \n\t"
2246 "gslqc1 $f6, $f4, 0x10($11) \n\t"
2247 "gslqc1 $f10, $f8, 0x20($11) \n\t"
2248 "gslqc1 $f14, $f12, 0x30($11) \n\t"
2249 "mov.d $f26, $f2 \n\t"
2250 "punpckhbh $f2, $f0, $f4 \n\t"
2251 "punpcklbh $f0, $f0, $f4 \n\t"
2252 "punpcklbh $f24, $f26, $f6 \n\t"
2253 "punpckhbh $f26, $f26, $f6 \n\t"
2254 "mov.d $f30, $f10 \n\t"
2255 "punpckhbh $f10, $f8, $f12 \n\t"
2256 "punpcklbh $f8, $f8, $f12 \n\t"
2257 "punpcklbh $f28, $f30, $f14 \n\t"
2258 "punpckhbh $f30, $f30, $f14 \n\t"
2259 "punpcklhw $f16, $f2, $f10 \n\t"
2260 "punpckhhw $f18, $f2, $f10 \n\t"
2261 "punpcklhw $f20, $f26, $f30 \n\t"
2262 "punpckhhw $f22, $f26, $f30 \n\t"
2263 "punpckhhw $f2, $f0, $f8 \n\t"
2264 "punpcklhw $f0, $f0, $f8 \n\t"
2265 "punpckhhw $f26, $f24, $f28 \n\t"
2266 "punpcklhw $f24, $f24, $f28 \n\t"
2267 "punpcklwd $f4, $f2, $f26 \n\t"
2268 "punpckhwd $f6, $f2, $f26 \n\t"
2269 "punpcklwd $f8, $f18, $f22 \n\t"
2270 "punpckhwd $f10, $f18, $f22 \n\t"
2271 "punpckhwd $f2, $f0, $f24 \n\t"
2272 "punpcklwd $f0, $f0, $f24 \n\t"
2273 "punpckhwd $f18, $f16, $f20 \n\t"
2274 "punpcklwd $f16, $f16, $f20 \n\t"
2275 "mov.d $f20, $f2 \n\t"
2276 "mov.d $f24, $f6 \n\t"
2277 "mov.d $f2, $f16 \n\t"
2278 "mov.d $f22, $f18 \n\t"
2279 "mov.d $f6, $f8 \n\t"
2280 "mov.d $f26, $f10 \n\t"
2281 "dli %[iAlpha], 0x20 \n\t"
2282 "dmtc1 %[iAlpha], $f8 \n\t"
2283 "gsswlc1 $f0, 0x3($9) \n\t"
2284 "gsswrc1 $f0, 0x0($9) \n\t"
2285 "daddu $12, $9, %[iStride] \n\t"
2286 "gsswlc1 $f20, 0x3($12) \n\t"
2287 "gsswrc1 $f20, 0x0($12) \n\t"
2288 "daddu $12, $12, %[iStride] \n\t"
2289 "gsswlc1 $f4, 0x3($12) \n\t"
2290 "gsswrc1 $f4, 0x0($12) \n\t"
2291 "daddu $12, $12, %[iStride] \n\t"
2292 "gsswlc1 $f24, 0x3($12) \n\t"
2293 "gsswrc1 $f24, 0x0($12) \n\t"
2294 "dsrl $f0, $f0, $f8 \n\t"
2295 "dsrl $f20, $f20, $f8 \n\t"
2296 "dsrl $f4, $f4, $f8 \n\t"
2297 "dsrl $f24, $f24, $f8 \n\t"
2298 "gsswlc1 $f0, 0x3($10) \n\t"
2299 "gsswrc1 $f0, 0x0($10) \n\t"
2300 "daddu $13, $10, %[iStride] \n\t"
2301 "daddu $8, $13, %[iStride] \n\t"
2302 "gsswlc1 $f20, 0x3($13) \n\t"
2303 "gsswrc1 $f20, 0x0($13) \n\t"
2304 "daddu $13, $8, %[iStride] \n\t"
2305 "gsswlc1 $f4, 0x3($8) \n\t"
2306 "gsswrc1 $f4, 0x0($8) \n\t"
2307 "gsswlc1 $f24, 0x3($13) \n\t"
2308 "gsswrc1 $f24, 0x0($13) \n\t"
2309 "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
2310 "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
2311 "daddu $12, %[pPixCb], %[iStride] \n\t"
2312 "gsswlc1 $f22, 0x3($12) \n\t"
2313 "gsswrc1 $f22, 0x0($12) \n\t"
2314 "daddu $12, $12, %[iStride] \n\t"
2315 "gsswlc1 $f6, 0x3($12) \n\t"
2316 "gsswrc1 $f6, 0x0($12) \n\t"
2317 "daddu $12, $12, %[iStride] \n\t"
2318 "gsswlc1 $f26, 0x3($12) \n\t"
2319 "gsswrc1 $f26, 0x0($12) \n\t"
2320 "dsrl $f2, $f2, $f8 \n\t"
2321 "dsrl $f22, $f22, $f8 \n\t"
2322 "dsrl $f6, $f6, $f8 \n\t"
2323 "dsrl $f26, $f26, $f8 \n\t"
2324 "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
2325 "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
2326 "daddu $13, %[pPixCr], %[iStride] \n\t"
2327 "daddu $8, $13, %[iStride] \n\t"
2328 "gsswlc1 $f22, 0x3($13) \n\t"
2329 "gsswrc1 $f22, 0x0($13) \n\t"
2330 "daddu $13, $8, %[iStride] \n\t"
2331 "gsswlc1 $f6, 0x3($8) \n\t"
2332 "gsswrc1 $f6, 0x0($8) \n\t"
2333 "gsswlc1 $f26, 0x3($13) \n\t"
2334 "gsswrc1 $f26, 0x0($13) \n\t"
2335 : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
2336 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
2337 [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
2338 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
2339 "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2340 "$f24", "$f26", "$f28", "$f30"
2341 );
2342 RECOVER_REG;
2343 }
2344
DeblockChromaLt4H_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)2345 void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
2346 int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
2347 unsigned char tmp[320] __attribute__((aligned(32)));
2348 BACKUP_REG;
2349 __asm__ volatile (
2350 ".set arch=loongson3a \n\t"
2351 "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
2352 "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
2353 "daddu $8, %[pPixCb], %[iStride] \n\t"
2354 "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
2355 "gsldlc1 $f4, 0x7($8) \n\t"
2356 "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
2357 "gsldrc1 $f4, 0x0($8) \n\t"
2358 "daddu $9, $8, %[iStride] \n\t"
2359 "daddu $8, $9, %[iStride] \n\t"
2360 "gsldlc1 $f8, 0x7($9) \n\t"
2361 "gsldlc1 $f12, 0x7($8) \n\t"
2362 "gsldrc1 $f8, 0x0($9) \n\t"
2363 "gsldrc1 $f12, 0x0($8) \n\t"
2364 "daddu $9, $8, %[iStride] \n\t"
2365
2366 "daddu $10, %[pPixCr], %[iStride] \n\t"
2367 "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
2368 "gsldlc1 $f20, 0x7($10) \n\t"
2369 "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
2370 "gsldrc1 $f20, 0x0($10) \n\t"
2371 "daddu $11, $10, %[iStride] \n\t"
2372 "daddu $10, $11, %[iStride] \n\t"
2373 "gsldlc1 $f24, 0x7($11) \n\t"
2374 "gsldlc1 $f28, 0x7($10) \n\t"
2375 "gsldrc1 $f24, 0x0($11) \n\t"
2376 "gsldrc1 $f28, 0x0($10) \n\t"
2377 "daddu $11, $10, %[iStride] \n\t"
2378
2379 "punpcklwd $f0, $f0, $f16 \n\t"
2380 "punpcklwd $f4, $f4, $f20 \n\t"
2381 "punpcklwd $f8, $f8, $f24 \n\t"
2382 "punpcklwd $f12, $f12, $f28 \n\t"
2383 "gsldlc1 $f16, 0x7($9) \n\t"
2384 "gsldlc1 $f20, 0x7($11) \n\t"
2385 "gsldrc1 $f16, 0x0($9) \n\t"
2386 "gsldrc1 $f20, 0x0($11) \n\t"
2387 "punpcklwd $f16, $f16, $f20 \n\t"
2388 "mov.d $f2, $f16 \n\t"
2389 "daddu $8, $9, %[iStride] \n\t"
2390 "daddu $10, $11, %[iStride] \n\t"
2391 "gsldlc1 $f16, 0x7($8) \n\t"
2392 "gsldlc1 $f20, 0x7($10) \n\t"
2393 "gsldrc1 $f16, 0x0($8) \n\t"
2394 "gsldrc1 $f20, 0x0($10) \n\t"
2395 "punpcklwd $f16, $f16, $f20 \n\t"
2396 "mov.d $f6, $f16 \n\t"
2397 "daddu $9, $8, %[iStride] \n\t"
2398 "daddu $11, $10, %[iStride] \n\t"
2399
2400 "gsldlc1 $f16, 0x7($9) \n\t"
2401 "gsldlc1 $f20, 0x7($11) \n\t"
2402 "gsldrc1 $f16, 0x0($9) \n\t"
2403 "gsldrc1 $f20, 0x0($11) \n\t"
2404 "punpcklwd $f16, $f16, $f20 \n\t"
2405 "mov.d $f10, $f16 \n\t"
2406 "daddu $8, $9, %[iStride] \n\t"
2407 "daddu $10, $11, %[iStride] \n\t"
2408
2409 "gsldlc1 $f16, 0x7($8) \n\t"
2410 "gsldlc1 $f20, 0x7($10) \n\t"
2411 "gsldrc1 $f16, 0x0($8) \n\t"
2412 "gsldrc1 $f20, 0x0($10) \n\t"
2413 "punpcklwd $f16, $f16, $f20 \n\t"
2414 "mov.d $f14, $f16 \n\t"
2415
2416 "punpcklbh $f24, $f2, $f6 \n\t"
2417 "punpckhbh $f26, $f2, $f6 \n\t"
2418 "punpckhbh $f2, $f0, $f4 \n\t"
2419 "punpcklbh $f0, $f0, $f4 \n\t"
2420 "punpcklbh $f28, $f10, $f14 \n\t"
2421 "punpckhbh $f30, $f10, $f14 \n\t"
2422 "punpckhbh $f10, $f8, $f12 \n\t"
2423 "punpcklbh $f8, $f8, $f12 \n\t"
2424
2425 "punpcklhw $f16, $f2, $f10 \n\t"
2426 "punpckhhw $f18, $f2, $f10 \n\t"
2427 "punpckhhw $f2, $f0, $f8 \n\t"
2428 "punpcklhw $f0, $f0, $f8 \n\t"
2429 "punpcklhw $f20, $f26, $f30 \n\t"
2430 "punpckhhw $f22, $f26, $f30 \n\t"
2431 "punpckhhw $f26, $f24, $f28 \n\t"
2432 "punpcklhw $f24, $f24, $f28 \n\t"
2433
2434 "punpcklwd $f4, $f2, $f26 \n\t"
2435 "punpckhwd $f6, $f2, $f26 \n\t"
2436 "punpckhwd $f2, $f0, $f24 \n\t"
2437 "punpcklwd $f0, $f0, $f24 \n\t"
2438 "punpcklwd $f8, $f18, $f22 \n\t"
2439 "punpckhwd $f10, $f18, $f22 \n\t"
2440 "punpckhwd $f18, $f16, $f20 \n\t"
2441 "punpcklwd $f16, $f16, $f20 \n\t"
2442
2443 "mov.d $f20, $f2 \n\t"
2444 "mov.d $f22, $f18 \n\t"
2445 "mov.d $f2, $f16 \n\t"
2446 "mov.d $f24, $f6 \n\t"
2447 "mov.d $f26, $f10 \n\t"
2448 "mov.d $f6, $f8 \n\t"
2449 "daddiu $11, %[tmp], 0x70 \n\t"
2450
2451 "gssqc1 $f2, $f0, 0x0($11) \n\t"
2452 "gssqc1 $f22, $f20, 0x10($11) \n\t"
2453 "gssqc1 $f6, $f4, 0x20($11) \n\t"
2454 "gssqc1 $f26, $f24, 0x30($11) \n\t"
2455
2456 "lb $8, 0x3(%[pTC]) \n\t"
2457 "lb $9, 0x2(%[pTC]) \n\t"
2458 "lb $10, 0x1(%[pTC]) \n\t"
2459 "lb $11, 0x0(%[pTC]) \n\t"
2460
2461 "and $12, $8, 0xFFFF \n\t"
2462 "dmtc1 $12, $f8 \n\t"
2463
2464 "and $9, $9, 0xFFFF \n\t"
2465 "dmtc1 $9, $f12 \n\t"
2466 "mov.d $f16, $f12 \n\t"
2467
2468 "and $9, $10, 0xFFFF \n\t"
2469 "dmtc1 $9, $f20 \n\t"
2470 "xor $f0, $f0, $f0 \n\t"
2471 "mov.d $f24, $f20 \n\t"
2472 "and $9, $11, 0xFFFF \n\t"
2473 "punpcklhw $f24, $f24, $f8 \n\t"
2474
2475 "mov.d $f4, $f8 \n\t"
2476 "dmtc1 $9, $f28 \n\t"
2477 "mov.d $f0, $f28 \n\t"
2478
2479 "punpcklhw $f28, $f28, $f12 \n\t"
2480 "punpcklhw $f20, $f20, $f4 \n\t"
2481 "xor $f4, $f4, $f4 \n\t"
2482 "xor $f6, $f6, $f6 \n\t"
2483 "punpcklhw $f28, $f28, $f20 \n\t"
2484 "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
2485 "punpcklhw $f0, $f0, $f16 \n\t"
2486 "punpcklhw $f0, $f0, $f24 \n\t"
2487
2488 "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
2489 "punpckhhw $f2, $f0, $f28 \n\t"
2490 "punpcklhw $f0, $f0, $f28 \n\t"
2491 "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
2492 "psubh $f8, $f4, $f0 \n\t"
2493 "psubh $f10, $f6, $f2 \n\t"
2494 "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
2495 "dmtc1 %[iAlpha], $f8 \n\t"
2496 "punpcklhw $f12, $f8, $f8 \n\t"
2497 "punpcklwd $f16, $f12, $f12 \n\t"
2498 "mov.d $f18, $f16 \n\t"
2499
2500 "dmtc1 %[iBeta], $f8 \n\t"
2501 "punpcklhw $f12, $f8, $f8 \n\t"
2502 "punpcklwd $f8, $f12, $f12 \n\t"
2503 "mov.d $f10, $f8 \n\t"
2504
2505 "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
2506 "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
2507 "punpckhbh $f10, $f24, $f4 \n\t"
2508 "punpcklbh $f8, $f24, $f4 \n\t"
2509 "punpcklbh $f24, $f26, $f6 \n\t"
2510 "punpckhbh $f26, $f26, $f6 \n\t"
2511
2512 "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
2513 "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
2514 "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
2515 "punpcklbh $f8, $f28, $f4 \n\t"
2516 "punpckhbh $f10, $f28, $f4 \n\t"
2517 "punpcklbh $f28, $f30, $f6 \n\t"
2518 "punpckhbh $f30, $f30, $f6 \n\t"
2519 "punpcklbh $f24, $f26, $f6 \n\t"
2520 "punpckhbh $f26, $f26, $f6 \n\t"
2521 "punpckhbh $f14, $f12, $f4 \n\t"
2522 "punpcklbh $f12, $f12, $f4 \n\t"
2523 "punpckhbh $f22, $f20, $f4 \n\t"
2524 "punpcklbh $f20, $f20, $f4 \n\t"
2525 "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
2526 "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
2527 "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
2528 "punpcklbh $f24, $f26, $f6 \n\t"
2529 "punpckhbh $f26, $f26, $f6 \n\t"
2530
2531 "dli $13, 0x4 \n\t"
2532 "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
2533 "dmtc1 $13, $f24 \n\t"
2534 "punpcklhw $f28, $f24, $f24 \n\t"
2535 "punpcklwd $f24, $f28, $f28 \n\t"
2536 "mov.d $f26, $f24 \n\t"
2537 "dli $12, 0x2 \n\t"
2538 "dli $13, 0x3 \n\t"
2539
2540 "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
2541 "dmfc1 %[iAlpha], $f0 \n\t"
2542 "dmfc1 %[iBeta], $f2 \n\t"
2543 "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
2544 "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
2545 "psubh $f28, $f28, $f20 \n\t"
2546 "psubh $f30, $f30, $f22 \n\t"
2547 "pcmpgth $f24, $f0, $f4 \n\t"
2548 "pcmpgth $f26, $f2, $f6 \n\t"
2549
2550 "dmtc1 $12, $f0 \n\t"
2551 "dmtc1 $13, $f2 \n\t"
2552 "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
2553 "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
2554 "psubh $f24, $f12, $f8 \n\t"
2555 "psubh $f26, $f14, $f10 \n\t"
2556 "psllh $f24, $f24, $f0 \n\t"
2557 "psllh $f26, $f26, $f0 \n\t"
2558 "paddh $f24, $f24, $f28 \n\t"
2559 "paddh $f26, $f26, $f30 \n\t"
2560 "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
2561 "paddh $f24, $f24, $f28 \n\t"
2562 "paddh $f26, $f26, $f30 \n\t"
2563 "psrah $f24, $f24, $f2 \n\t"
2564 "psrah $f26, $f26, $f2 \n\t"
2565 "pmaxsh $f4, $f4, $f24 \n\t"
2566 "pmaxsh $f6, $f6, $f26 \n\t"
2567
2568 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
2569 "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
2570 "pminsh $f24, $f24, $f4 \n\t"
2571 "pminsh $f26, $f26, $f6 \n\t"
2572
2573 "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
2574 "psubh $f4, $f8, $f12 \n\t"
2575 "psubh $f6, $f10, $f14 \n\t"
2576 WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
2577 "pcmpgth $f24, $f16, $f4 \n\t"
2578 "pcmpgth $f26, $f18, $f6 \n\t"
2579 "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
2580 "psubh $f4, $f4, $f8 \n\t"
2581 "psubh $f6, $f6, $f10 \n\t"
2582 WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
2583 "pcmpgth $f28, $f28, $f4 \n\t"
2584 "pcmpgth $f30, $f30, $f6 \n\t"
2585
2586 "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
2587 "and $f24, $f24, $f28 \n\t"
2588 "and $f26, $f26, $f30 \n\t"
2589 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
2590 "psubh $f20, $f20, $f12 \n\t"
2591 "psubh $f22, $f22, $f14 \n\t"
2592 WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
2593 "pcmpgth $f4, $f4, $f20 \n\t"
2594 "pcmpgth $f6, $f6, $f22 \n\t"
2595
2596 "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
2597 "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
2598 "psubh $f20, $f20, $f0 \n\t"
2599 "psubh $f22, $f22, $f2 \n\t"
2600 "and $f24, $f24, $f4 \n\t"
2601 "and $f26, $f26, $f6 \n\t"
2602 "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
2603 "and $f24, $f24, $f0 \n\t"
2604 "and $f26, $f26, $f2 \n\t"
2605
2606 "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
2607 "and $f4, $f4, $f24 \n\t"
2608 "and $f6, $f6, $f26 \n\t"
2609 "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
2610 "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
2611 "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
2612
2613 "dmtc1 $12, $f0 \n\t"
2614 "psubh $f24, $f24, $f4 \n\t"
2615 "psubh $f26, $f26, $f6 \n\t"
2616 "psllh $f24, $f24, $f0 \n\t"
2617 "psllh $f26, $f26, $f0 \n\t"
2618 "paddh $f24, $f24, $f20 \n\t"
2619 "paddh $f26, $f26, $f22 \n\t"
2620 "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
2621 "paddh $f24, $f24, $f0 \n\t"
2622 "paddh $f26, $f26, $f2 \n\t"
2623 "dmtc1 %[iBeta], $f2 \n\t"
2624
2625 "dmtc1 $13, $f0 \n\t"
2626 "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
2627 "psrah $f24, $f24, $f0 \n\t"
2628 "psrah $f26, $f26, $f0 \n\t"
2629 "dmtc1 %[iAlpha], $f0 \n\t"
2630 "pmaxsh $f20, $f20, $f24 \n\t"
2631 "pmaxsh $f22, $f22, $f26 \n\t"
2632 "pminsh $f0, $f0, $f20 \n\t"
2633 "pminsh $f2, $f2, $f22 \n\t"
2634
2635 "dmfc1 %[iAlpha], $f0 \n\t"
2636 "dmfc1 %[iBeta], $f2 \n\t"
2637 "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
2638 "psubh $f24, $f4, $f20 \n\t"
2639 "psubh $f26, $f6, $f22 \n\t"
2640 WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2641 "pcmpgth $f16, $f16, $f24 \n\t"
2642 "pcmpgth $f18, $f18, $f26 \n\t"
2643
2644 "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
2645 "psubh $f24, $f24, $f4 \n\t"
2646 "psubh $f26, $f26, $f6 \n\t"
2647 WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2648 "pcmpgth $f28, $f28, $f24 \n\t"
2649 "pcmpgth $f30, $f30, $f26 \n\t"
2650
2651 "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
2652 "and $f16, $f16, $f28 \n\t"
2653 "and $f18, $f18, $f30 \n\t"
2654
2655 "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
2656 "psubh $f24, $f24, $f20 \n\t"
2657 "psubh $f26, $f26, $f22 \n\t"
2658 WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2659 "pcmpgth $f28, $f28, $f24 \n\t"
2660 "pcmpgth $f30, $f30, $f26 \n\t"
2661 "and $f16, $f16, $f28 \n\t"
2662 "and $f18, $f18, $f30 \n\t"
2663 "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
2664 "dmtc1 %[iAlpha], $f0 \n\t"
2665 "dmtc1 %[iBeta], $f2 \n\t"
2666 "and $f16, $f16, $f28 \n\t"
2667 "and $f18, $f18, $f30 \n\t"
2668 "and $f0, $f0, $f16 \n\t"
2669 "and $f2, $f2, $f18 \n\t"
2670
2671 "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
2672 "paddh $f8, $f8, $f16 \n\t"
2673 "paddh $f10, $f10, $f18 \n\t"
2674 "paddh $f4, $f4, $f0 \n\t"
2675 "paddh $f6, $f6, $f2 \n\t"
2676 "psubh $f12, $f12, $f16 \n\t"
2677 "psubh $f14, $f14, $f18 \n\t"
2678 "psubh $f20, $f20, $f0 \n\t"
2679 "psubh $f22, $f22, $f2 \n\t"
2680 "packushb $f8, $f8, $f10 \n\t"
2681 "packushb $f10, $f4, $f6 \n\t"
2682 "packushb $f12, $f12, $f14 \n\t"
2683 "packushb $f14, $f20, $f22 \n\t"
2684
2685 "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
2686 "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
2687 "daddiu $11, %[tmp], 0x70 \n\t"
2688
2689 "gslqc1 $f2, $f0, 0x0($11) \n\t"
2690 "gslqc1 $f6, $f4, 0x10($11) \n\t"
2691 "gslqc1 $f10, $f8, 0x20($11) \n\t"
2692 "gslqc1 $f14, $f12, 0x30($11) \n\t"
2693
2694 "punpcklbh $f24, $f2, $f6 \n\t"
2695 "punpckhbh $f26, $f2, $f6 \n\t"
2696 "punpckhbh $f2, $f0, $f4 \n\t"
2697 "punpcklbh $f0, $f0, $f4 \n\t"
2698
2699 "punpcklbh $f28, $f10, $f14 \n\t"
2700 "punpckhbh $f30, $f10, $f14 \n\t"
2701 "punpckhbh $f10, $f8, $f12 \n\t"
2702 "punpcklbh $f8, $f8, $f12 \n\t"
2703
2704 "punpcklhw $f16, $f2, $f10 \n\t"
2705 "punpckhhw $f18, $f2, $f10 \n\t"
2706 "punpckhhw $f2, $f0, $f8 \n\t"
2707 "punpcklhw $f0, $f0, $f8 \n\t"
2708 "punpcklhw $f20, $f26, $f30 \n\t"
2709 "punpckhhw $f22, $f26, $f30 \n\t"
2710 "punpckhhw $f26, $f24, $f28 \n\t"
2711 "punpcklhw $f24, $f24, $f28 \n\t"
2712
2713 "punpcklwd $f4, $f2, $f26 \n\t"
2714 "punpckhwd $f6, $f2, $f26 \n\t"
2715 "punpckhwd $f2, $f0, $f24 \n\t"
2716 "punpcklwd $f0, $f0, $f24 \n\t"
2717 "punpcklwd $f8, $f18, $f22 \n\t"
2718 "punpckhwd $f10, $f18, $f22 \n\t"
2719 "punpckhwd $f18, $f16, $f20 \n\t"
2720 "punpcklwd $f16, $f16, $f20 \n\t"
2721
2722 "mov.d $f20, $f2 \n\t"
2723 "mov.d $f22, $f18 \n\t"
2724 "mov.d $f2, $f16 \n\t"
2725 "mov.d $f24, $f6 \n\t"
2726 "mov.d $f26, $f10 \n\t"
2727 "mov.d $f6, $f8 \n\t"
2728
2729 "dli %[iAlpha], 0x20 \n\t"
2730 "daddu $8, %[pPixCb], %[iStride] \n\t"
2731 "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
2732 "gsswlc1 $f20, 0x3($8) \n\t"
2733 "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
2734 "gsswrc1 $f20, 0x0($8) \n\t"
2735 "daddu $9, $8, %[iStride] \n\t"
2736 "daddu $8, $9, %[iStride] \n\t"
2737 "gsswlc1 $f4, 0x3($9) \n\t"
2738 "gsswlc1 $f24, 0x3($8) \n\t"
2739 "gsswrc1 $f4, 0x0($9) \n\t"
2740 "gsswrc1 $f24, 0x0($8) \n\t"
2741 "daddu $9, $8, %[iStride] \n\t"
2742 "dmtc1 %[iAlpha], $f8 \n\t"
2743
2744 "dsrl $f0, $f0, $f8 \n\t"
2745 "dsrl $f20, $f20, $f8 \n\t"
2746 "dsrl $f4, $f4, $f8 \n\t"
2747 "dsrl $f24, $f24, $f8 \n\t"
2748 "daddu $10, %[pPixCr], %[iStride] \n\t"
2749 "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
2750 "gsswlc1 $f20, 0x3($10) \n\t"
2751 "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
2752 "gsswrc1 $f20, 0x0($10) \n\t"
2753 "daddu $11, $10, %[iStride] \n\t"
2754 "daddu $10, $11, %[iStride] \n\t"
2755 "gsswlc1 $f4, 0x3($11) \n\t"
2756 "gsswlc1 $f24, 0x3($10) \n\t"
2757 "gsswrc1 $f4, 0x0($11) \n\t"
2758 "gsswrc1 $f24, 0x0($10) \n\t"
2759 "daddu $11, $10, %[iStride] \n\t"
2760
2761 "daddu $8, $9, %[iStride] \n\t"
2762 "gsswlc1 $f2, 0x3($9) \n\t"
2763 "gsswlc1 $f22, 0x3($8) \n\t"
2764 "gsswrc1 $f2, 0x0($9) \n\t"
2765 "gsswrc1 $f22, 0x0($8) \n\t"
2766 "daddu $9, $8, %[iStride] \n\t"
2767 "daddu $8, $9, %[iStride] \n\t"
2768 "gsswlc1 $f6, 0x3($9) \n\t"
2769 "gsswlc1 $f26, 0x3($8) \n\t"
2770 "gsswrc1 $f6, 0x0($9) \n\t"
2771 "gsswrc1 $f26, 0x0($8) \n\t"
2772
2773 "dsrl $f2, $f2, $f8 \n\t"
2774 "dsrl $f22, $f22, $f8 \n\t"
2775 "dsrl $f6, $f6, $f8 \n\t"
2776 "dsrl $f26, $f26, $f8 \n\t"
2777 "daddu $10, $11, %[iStride] \n\t"
2778 "gsswlc1 $f2, 0x3($11) \n\t"
2779 "gsswlc1 $f22, 0x3($10) \n\t"
2780 "gsswrc1 $f2, 0x0($11) \n\t"
2781 "gsswrc1 $f22, 0x0($10) \n\t"
2782 "daddu $11, $10, %[iStride] \n\t"
2783 "daddu $10, $11, %[iStride] \n\t"
2784 "gsswlc1 $f6, 0x3($11) \n\t"
2785 "gsswlc1 $f26, 0x3($10) \n\t"
2786 "gsswrc1 $f6, 0x0($11) \n\t"
2787 "gsswrc1 $f26, 0x0($10) \n\t"
2788 : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
2789 : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
2790 [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
2791 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
2792 "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
2793 "$f26", "$f28", "$f30"
2794 );
2795 RECOVER_REG;
2796 }
2797
WelsNonZeroCount_mmi(int8_t * pNonZeroCount)2798 void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
2799 __asm__ volatile(
2800 ".set arch=loongson3a \n\t"
2801 "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
2802 "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
2803 "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
2804 "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
2805 "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
2806 "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
2807 "pcmpeqh $f8, $f8, $f8 \n\t"
2808 "dli $8, 0xF \n\t"
2809 "dmtc1 $8, $f6 \n\t"
2810 "psrlh $f8, $f8, $f6 \n\t"
2811 "packushb $f8, $f8, $f8 \n\t"
2812
2813 "pminub $f0, $f0, $f8 \n\t"
2814 "pminub $f2, $f2, $f8 \n\t"
2815 "pminub $f4, $f4, $f8 \n\t"
2816 "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
2817 "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
2818 "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
2819 "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
2820 "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
2821 "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
2822 :
2823 : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
2824 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
2825 );
2826 }
2827