• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    deblock_mmi.c
33  *
34  * \brief   Loongson optimize
35  *
36  * \date    20/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
DeblockLumaLt4V_mmi(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)43 void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
44                          int32_t iBeta, int8_t *pTC) {
45   unsigned char tmp[512] __attribute__((aligned(32)));
46   BACKUP_REG;
47   __asm__ volatile (
48     ".set       arch=loongson3a                           \n\t"
49     "dsll       $8, %[iStride], 0x1                       \n\t"
50     "daddu      $8, $8, %[iStride]                        \n\t"
51     "dsubu      $14, %[pPix], $8                          \n\t"
52 
53     "dsll       $8, %[iStride], 0x1                       \n\t"
54     "dsubu      $9, %[pPix], $8                           \n\t"
55 
56     "dmtc1      %[iAlpha], $f0                            \n\t"
57     "dsubu      $13, %[pPix], %[iStride]                  \n\t"
58     "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
59     "daddu      $12, $8, %[pPix]                          \n\t"
60 
61     "punpcklhw  $f0, $f0, $f0                             \n\t"
62     "lb         $8, 0x0(%[pTC])                           \n\t"
63     "punpcklwd  $f0, $f0, $f0                             \n\t"
64     "mov.d      $f2, $f0                                  \n\t"
65     "gssqc1     $f2, $f0, 432-112(%[tmp])                 \n\t"
66     "dmtc1      %[iBeta], $f0                             \n\t"
67     "lb         %[iAlpha], 0x1(%[pTC])                    \n\t"
68     "dli        %[iBeta], 0xFFFF                          \n\t"
69     "punpcklhw  $f0, $f0, $f0                             \n\t"
70     "and        $10, %[iAlpha], %[iBeta]                  \n\t"
71     "punpcklwd  $f0, $f0, $f0                             \n\t"
72     "mov.d      $f2, $f0                                  \n\t"
73     "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
74     "dmtc1      $10, $f4                                  \n\t"
75     "mov.d      $f8, $f4                                  \n\t"
76     "dmtc1      %[iAlpha], $f16                           \n\t"
77     "and        %[iAlpha], $8, %[iBeta]                   \n\t"
78     "dmtc1      %[iAlpha], $f20                           \n\t"
79     "mov.d      $f24, $f20                                \n\t"
80     "mov.d      $f28, $f20                                \n\t"
81     "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
82     "dmtc1      %[iAlpha], $f0                            \n\t"
83 
84     "lb         %[iAlpha], 0x3(%[pTC])                    \n\t"
85     "lb         %[pTC], 0x2(%[pTC])                       \n\t"
86     "dmtc1      $10, $f12                                 \n\t"
87     "punpcklhw  $f0, $f0, $f16                            \n\t"
88     "and        $8, %[iAlpha], %[iBeta]                   \n\t"
89     "punpcklhw  $f24, $f24, $f8                           \n\t"
90     "punpcklhw  $f20, $f20, $f4                           \n\t"
91     "punpcklhw  $f0, $f0, $f24                            \n\t"
92     "punpcklhw  $f28, $f28, $f12                          \n\t"
93     "punpcklhw  $f28, $f28, $f20                          \n\t"
94     "punpckhhw  $f2, $f0, $f28                            \n\t"
95     "punpcklhw  $f0, $f0, $f28                            \n\t"
96     "gssqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
97     "dmtc1      $8, $f0                                   \n\t"
98     "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
99     "mov.d      $f8, $f0                                  \n\t"
100     "dmtc1      %[iAlpha], $f16                           \n\t"
101     "and        %[iAlpha], %[pTC], %[iBeta]               \n\t"
102     "dmtc1      $8, $f12                                  \n\t"
103     "dmtc1      %[iAlpha], $f20                           \n\t"
104     "punpcklhw  $f20, $f20, $f0                           \n\t"
105 
106     "xor        $f0, $f0, $f0                             \n\t"
107     "dmtc1      %[iAlpha], $f24                           \n\t"
108     "and        %[pTC], %[pTC], %[iBeta]                  \n\t"
109     "punpcklhw  $f24, $f24, $f8                           \n\t"
110     "dmtc1      %[iAlpha], $f28                           \n\t"
111     "dmtc1      %[pTC], $f4                               \n\t"
112 
113     "gslqc1     $f10, $f8, 0x0($9)                        \n\t"
114     "punpckhbh  $f10, $f8, $f0                            \n\t"
115     "punpcklbh  $f8, $f8, $f0                             \n\t"
116 
117     "dli        %[iAlpha], 0x4                            \n\t"
118     "seh        %[pTC], %[iAlpha]                         \n\t"
119     "punpcklhw  $f28, $f28, $f12                          \n\t"
120     "punpcklhw  $f28, $f28, $f20                          \n\t"
121     "gslqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
122     "gslqc1     $f14, $f12, 0x0($13)                      \n\t"
123     "gsldxc1    $f2, 0x0($12, $0)                         \n\t"
124     "punpckhbh  $f22, $f20, $f0                           \n\t"
125     "punpcklbh  $f20, $f20, $f0                           \n\t"
126     "gssqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
127     "punpckhbh  $f22, $f2, $f0                            \n\t"
128     "punpcklbh  $f20, $f2, $f0                            \n\t"
129     "gssqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
130     "punpcklhw  $f4, $f4, $f16                            \n\t"
131     "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
132     "punpcklhw  $f4, $f4, $f24                            \n\t"
133     "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
134     "punpckhhw  $f6, $f4, $f28                            \n\t"
135     "punpcklhw  $f4, $f4, $f28                            \n\t"
136     "punpckhbh  $f26, $f24, $f0                           \n\t"
137     "punpcklbh  $f24, $f24, $f0                           \n\t"
138     "punpckhbh  $f14, $f12, $f0                           \n\t"
139     "punpcklbh  $f12, $f12, $f0                           \n\t"
140     "punpckhbh  $f18, $f16, $f0                           \n\t"
141     "punpcklbh  $f16, $f16, $f0                           \n\t"
142     "psubh      $f28, $f12, $f16                          \n\t"
143     "psubh      $f30, $f14, $f18                          \n\t"
144     "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
145     WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
146     "gslqc1     $f18, $f16, 432-336(%[tmp])               \n\t"
147     "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
148     "pcmpgth    $f20, $f16, $f28                          \n\t"
149     "pcmpgth    $f22, $f18, $f30                          \n\t"
150     "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
151     "psubh      $f28, $f24, $f0                           \n\t"
152     "psubh      $f30, $f26, $f2                           \n\t"
153     WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
154     "pcmpgth    $f20, $f16, $f28                          \n\t"
155     "pcmpgth    $f22, $f18, $f30                          \n\t"
156     "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
157     "pavgh      $f20, $f12, $f24                          \n\t"
158     "pavgh      $f22, $f14, $f26                          \n\t"
159     "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
160     "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
161     "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
162     "gslqc1     $f2, $f0, 432-256(%[tmp])                 \n\t"
163     "psubh      $f20, $f20, $f28                          \n\t"
164     "psubh      $f22, $f22, $f30                          \n\t"
165     "psubh      $f20, $f20, $f0                           \n\t"
166     "psubh      $f22, $f22, $f2                           \n\t"
167     "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
168     "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
169     "psubh      $f20, $f24, $f12                          \n\t"
170     "psubh      $f22, $f26, $f14                          \n\t"
171     "gssqc1     $f26, $f24, 432-32(%[tmp])                \n\t"
172     "psubh      $f24, $f24, $f0                           \n\t"
173     "psubh      $f26, $f26, $f2                           \n\t"
174     "gssqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
175     WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
176     "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
177     "pcmpgth    $f20, $f20, $f28                          \n\t"
178     "pcmpgth    $f22, $f22, $f30                          \n\t"
179     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
180     "pcmpgth    $f28, $f16, $f24                          \n\t"
181     "pcmpgth    $f30, $f18, $f26                          \n\t"
182 
183     "xor        $f0, $f0, $f0                             \n\t"
184     "and        $f20, $f20, $f28                          \n\t"
185     "and        $f22, $f22, $f30                          \n\t"
186     "psubh      $f24, $f12, $f8                           \n\t"
187     "psubh      $f26, $f14, $f10                          \n\t"
188     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
189     "pcmpgth    $f28, $f16, $f24                          \n\t"
190     "pcmpgth    $f30, $f18, $f26                          \n\t"
191     "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
192     "and        $f20, $f20, $f28                          \n\t"
193     "and        $f22, $f22, $f30                          \n\t"
194     "pcmpgth    $f28, $f24, $f0                           \n\t"
195     "pcmpgth    $f30, $f26, $f0                           \n\t"
196     "pcmpeqh    $f24, $f24, $f0                           \n\t"
197     "pcmpeqh    $f26, $f26, $f0                           \n\t"
198     "or         $f28, $f28, $f24                          \n\t"
199     "or         $f30, $f30, $f26                          \n\t"
200     "and        $f20, $f20, $f28                          \n\t"
201     "and        $f22, $f22, $f30                          \n\t"
202     "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
203     "dmtc1      %[pTC], $f20                              \n\t"
204     "punpckhhw  $f26, $f20, $f20                          \n\t"
205     "punpcklhw  $f24, $f20, $f20                          \n\t"
206     "punpcklwd  $f20, $f24, $f24                          \n\t"
207     "mov.d      $f22, $f20                                \n\t"
208     "gssqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
209     "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
210     "psubh      $f24, $f0, $f20                           \n\t"
211     "dli        $11, 0x2                                  \n\t"
212     "psubh      $f26, $f0, $f22                           \n\t"
213     "dmtc1      $11, $f28                                 \n\t"
214     "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
215     "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
216     "psllh      $f20, $f20, $f28                          \n\t"
217     "psllh      $f22, $f22, $f28                          \n\t"
218     "psubh      $f28, $f8, $f0                            \n\t"
219     "psubh      $f30, $f10, $f2                           \n\t"
220     "paddh      $f28, $f28, $f20                          \n\t"
221     "paddh      $f30, $f30, $f22                          \n\t"
222     "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
223     "paddh      $f28, $f28, $f20                          \n\t"
224     "paddh      $f30, $f30, $f22                          \n\t"
225     "dli        $11, 0x3                                  \n\t"
226     "dmtc1      $11, $f20                                 \n\t"
227     "psrah      $f28, $f28, $f20                          \n\t"
228     "psrah      $f30, $f30, $f20                          \n\t"
229     "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
230     "pmaxsh     $f24, $f24, $f28                          \n\t"
231     "pmaxsh     $f26, $f26, $f30                          \n\t"
232     "gslqc1     $f2, $f0, 432-320(%[tmp])                 \n\t"
233     "pminsh     $f20, $f20, $f24                          \n\t"
234     "pminsh     $f22, $f22, $f26                          \n\t"
235 
236     "and        $f20, $f20, $f0                           \n\t"
237     "and        $f22, $f22, $f2                           \n\t"
238     "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
239     "gssqc1     $f22, $f20, 432-64(%[tmp])                \n\t"
240     "xor        $f0, $f0, $f0                             \n\t"
241     "gssqc1     $f26, $f24, 432-384(%[tmp])               \n\t"
242     "psubh      $f20, $f0, $f24                           \n\t"
243     "psubh      $f22, $f0, $f26                           \n\t"
244     "gssqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
245     "mov.d      $f24, $f20                                \n\t"
246     "mov.d      $f26, $f22                                \n\t"
247     "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
248     "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
249     "paddh      $f20, $f20, $f28                          \n\t"
250     "paddh      $f22, $f22, $f30                          \n\t"
251     "paddh      $f28, $f8, $f8                            \n\t"
252     "paddh      $f30, $f10, $f10                          \n\t"
253     "psubh      $f20, $f20, $f28                          \n\t"
254     "psubh      $f22, $f22, $f30                          \n\t"
255     "dli        $11, 0x1                                  \n\t"
256     "dmtc1      $11, $f28                                 \n\t"
257     "psrah      $f20, $f20, $f28                          \n\t"
258     "psrah      $f22, $f22, $f28                          \n\t"
259     "pmaxsh     $f24, $f24, $f20                          \n\t"
260     "pmaxsh     $f26, $f26, $f22                          \n\t"
261     "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
262     "pminsh     $f20, $f20, $f24                          \n\t"
263     "pminsh     $f22, $f22, $f26                          \n\t"
264 
265     "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
266     "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
267     "and        $f20, $f20, $f24                          \n\t"
268     "and        $f22, $f22, $f26                          \n\t"
269     "and        $f20, $f20, $f28                          \n\t"
270     "and        $f22, $f22, $f30                          \n\t"
271     "gslqc1     $f26, $f24, 432-240(%[tmp])               \n\t"
272     "gssqc1     $f22, $f20, 432-96(%[tmp])                \n\t"
273     "gslqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
274     "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
275     "paddh      $f20, $f20, $f28                          \n\t"
276     "paddh      $f22, $f22, $f30                          \n\t"
277     "paddh      $f28, $f24, $f24                          \n\t"
278     "paddh      $f30, $f26, $f26                          \n\t"
279     "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
280     "dli        $11, 0x1                                  \n\t"
281     "psubh      $f20, $f20, $f28                          \n\t"
282     "dmtc1      $11, $f28                                 \n\t"
283     "psubh      $f22, $f22, $f30                          \n\t"
284 
285     "psrah      $f20, $f20, $f28                          \n\t"
286     "psrah      $f22, $f22, $f28                          \n\t"
287     "gslqc1     $f30, $f28, 0x0(%[iStride])               \n\t"
288     "pmaxsh     $f24, $f24, $f20                          \n\t"
289     "pmaxsh     $f26, $f26, $f22                          \n\t"
290     "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
291     "pminsh     $f20, $f20, $f24                          \n\t"
292     "pminsh     $f22, $f22, $f26                          \n\t"
293     "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
294     "and        $f20, $f20, $f24                          \n\t"
295     "and        $f22, $f22, $f26                          \n\t"
296     "gslqc1     $f26, $f24, 432-256(%[tmp])               \n\t"
297     "and        $f20, $f20, $f24                          \n\t"
298     "and        $f22, $f22, $f26                          \n\t"
299     "gslqc1     $f26, $f24, 0x0($9)                       \n\t"
300     "punpcklbh  $f28, $f30, $f0                           \n\t"
301     "punpckhbh  $f30, $f30, $f0                           \n\t"
302     "gssqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
303 
304     "gslqc1     $f30, $f28, 0x0($12)                      \n\t"
305     "punpcklbh  $f24, $f26, $f0                           \n\t"
306     "punpckhbh  $f26, $f26, $f0                           \n\t"
307     "gssqc1     $f22, $f20, 432-48(%[tmp])                \n\t"
308     "gslqc1     $f22, $f20, 0x0($14)                      \n\t"
309     "gssqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
310     "gslqc1     $f26, $f24, 0x0($13)                      \n\t"
311     "punpcklbh  $f28, $f30, $f0                           \n\t"
312     "punpckhbh  $f30, $f30, $f0                           \n\t"
313     "punpcklbh  $f20, $f22, $f0                           \n\t"
314     "punpckhbh  $f22, $f22, $f0                           \n\t"
315     "gssqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
316     "punpcklbh  $f24, $f26, $f0                           \n\t"
317     "punpckhbh  $f26, $f26, $f0                           \n\t"
318     "gssqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
319 
320     "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
321     "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
322     "psubh      $f28, $f28, $f20                          \n\t"
323     "psubh      $f30, $f30, $f22                          \n\t"
324     "gssqc1     $f22, $f20, 432-16(%[tmp])                \n\t"
325     WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
326     "punpcklbh  $f24, $f26, $f0                           \n\t"
327     "punpckhbh  $f26, $f26, $f0                           \n\t"
328     "pcmpgth    $f20, $f16, $f28                          \n\t"
329     "pcmpgth    $f22, $f18, $f30                          \n\t"
330     "gslqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
331     "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
332 
333     "psubh      $f28, $f24, $f28                          \n\t"
334     "psubh      $f30, $f26, $f30                          \n\t"
335     WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
336     "pcmpgth    $f20, $f16, $f28                          \n\t"
337     "pcmpgth    $f22, $f18, $f30                          \n\t"
338     "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
339 
340     "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
341     "gssqc1     $f26, $f24, 432-80(%[tmp])                \n\t"
342     "pavgh      $f20, $f20, $f24                          \n\t"
343     "pavgh      $f22, $f22, $f26                          \n\t"
344     "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
345 
346     "gslqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
347     "gslqc1     $f30, $f28, 432-256(%[tmp])               \n\t"
348     "psubh      $f20, $f4, $f20                           \n\t"
349     "psubh      $f22, $f6, $f22                           \n\t"
350     "psubh      $f20, $f20, $f28                          \n\t"
351     "psubh      $f22, $f22, $f30                          \n\t"
352     "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
353     "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
354     "gslqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
355     "psubh      $f20, $f24, $f20                          \n\t"
356     "psubh      $f22, $f26, $f22                          \n\t"
357     "psubh      $f24, $f24, $f28                          \n\t"
358     "psubh      $f26, $f26, $f30                          \n\t"
359     "gssqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
360     "mov.d      $f28, $f20                                \n\t"
361     "mov.d      $f30, $f22                                \n\t"
362     WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
363     "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
364     "pcmpgth    $f20, $f20, $f28                          \n\t"
365     "pcmpgth    $f22, $f22, $f30                          \n\t"
366     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
367     "pcmpgth    $f28, $f16, $f24                          \n\t"
368     "pcmpgth    $f30, $f18, $f26                          \n\t"
369     "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
370 
371     "and        $f20, $f20, $f28                          \n\t"
372     "and        $f22, $f22, $f30                          \n\t"
373     "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
374     "psubh      $f28, $f28, $f24                          \n\t"
375     "psubh      $f30, $f30, $f26                          \n\t"
376     "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
377     "psubh      $f24, $f24, $f0                           \n\t"
378     "psubh      $f26, $f26, $f2                           \n\t"
379     WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
380     "pcmpgth    $f16, $f16, $f28                          \n\t"
381     "pcmpgth    $f18, $f18, $f30                          \n\t"
382     "gslqc1     $f30, $f28, 432-96(%[tmp])                \n\t"
383     "and        $f20, $f20, $f16                          \n\t"
384     "and        $f22, $f22, $f18                          \n\t"
385     "xor        $f0, $f0, $f0                             \n\t"
386 
387     "paddh      $f8, $f8, $f28                            \n\t"
388     "paddh      $f10, $f10, $f30                          \n\t"
389     "pcmpgth    $f16, $f4, $f0                            \n\t"
390     "pcmpgth    $f18, $f6, $f0                            \n\t"
391     "pcmpeqh    $f28, $f4, $f0                            \n\t"
392     "pcmpeqh    $f30, $f6, $f0                            \n\t"
393     "or         $f16, $f16, $f28                          \n\t"
394     "or         $f18, $f18, $f30                          \n\t"
395     "and        $f20, $f20, $f16                          \n\t"
396     "and        $f22, $f22, $f18                          \n\t"
397     "gslqc1     $f18, $f16, 432-224(%[tmp])               \n\t"
398     "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
399     "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
400     "dli        $11, 0x2                                  \n\t"
401     "psubh      $f28, $f0, $f16                           \n\t"
402     "psubh      $f30, $f0, $f18                           \n\t"
403     "psubh      $f2, $f0, $f6                             \n\t"
404     "psubh      $f0, $f0, $f4                             \n\t"
405     "dmfc1      %[iAlpha], $f28                           \n\t"
406     "dmtc1      $11, $f28                                 \n\t"
407     "psllh      $f20, $f20, $f28                          \n\t"
408     "psllh      $f22, $f22, $f28                          \n\t"
409     "dmtc1      %[iAlpha], $f28                           \n\t"
410     "paddh      $f24, $f24, $f20                          \n\t"
411     "paddh      $f26, $f26, $f22                          \n\t"
412     "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
413     "paddh      $f24, $f24, $f20                          \n\t"
414     "paddh      $f26, $f26, $f22                          \n\t"
415     "gslqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
416     "dli        $11, 0x3                                  \n\t"
417     "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
418     "dmfc1      %[iAlpha], $f0                            \n\t"
419     "dmtc1      $11, $f0                                  \n\t"
420     "psrah      $f24, $f24, $f0                           \n\t"
421     "psrah      $f26, $f26, $f0                           \n\t"
422     "dmtc1      %[iAlpha], $f0                            \n\t"
423     "pmaxsh     $f28, $f28, $f24                          \n\t"
424     "pmaxsh     $f30, $f30, $f26                          \n\t"
425     "pminsh     $f16, $f16, $f28                          \n\t"
426     "pminsh     $f18, $f18, $f30                          \n\t"
427     "gslqc1     $f30, $f28, 432-320(%[tmp])               \n\t"
428     "and        $f16, $f16, $f28                          \n\t"
429     "and        $f18, $f18, $f30                          \n\t"
430     "mov.d      $f24, $f0                                 \n\t"
431     "mov.d      $f26, $f2                                 \n\t"
432     "gslqc1     $f2, $f0, 432-16(%[tmp])                  \n\t"
433     "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
434     "paddh      $f0, $f0, $f28                            \n\t"
435     "paddh      $f2, $f2, $f30                            \n\t"
436     "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
437     "gslqc1     $f18, $f16, 432-368(%[tmp])               \n\t"
438     "dli        $11, 0x1                                  \n\t"
439     "paddh      $f16, $f16, $f16                          \n\t"
440     "paddh      $f18, $f18, $f18                          \n\t"
441     "psubh      $f0, $f0, $f16                            \n\t"
442     "psubh      $f2, $f2, $f18                            \n\t"
443 
444     "dmtc1      $11, $f28                                 \n\t"
445     "gslqc1     $f18, $f16, 432-64(%[tmp])                \n\t"
446     "psrah      $f0, $f0, $f28                            \n\t"
447     "psrah      $f2, $f2, $f28                            \n\t"
448     "pmaxsh     $f24, $f24, $f0                           \n\t"
449     "pmaxsh     $f26, $f26, $f2                           \n\t"
450     "gslqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
451     "pminsh     $f28, $f4, $f24                           \n\t"
452     "pminsh     $f30, $f6, $f26                           \n\t"
453     "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
454     "and        $f28, $f28, $f24                          \n\t"
455     "and        $f30, $f30, $f26                          \n\t"
456     "dmfc1      %[iAlpha], $f24                           \n\t"
457     "dmfc1      %[iBeta], $f26                            \n\t"
458     "gslqc1     $f26, $f24, 432-288(%[tmp])               \n\t"
459     "and        $f28, $f28, $f24                          \n\t"
460     "and        $f30, $f30, $f26                          \n\t"
461     "paddh      $f20, $f20, $f28                          \n\t"
462     "paddh      $f22, $f22, $f30                          \n\t"
463     "packushb   $f8, $f8, $f10                            \n\t"
464     "packushb   $f10, $f20, $f22                          \n\t"
465     "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
466     "paddh      $f0, $f0, $f20                            \n\t"
467     "paddh      $f2, $f2, $f22                            \n\t"
468     "paddh      $f12, $f12, $f16                          \n\t"
469     "paddh      $f14, $f14, $f18                          \n\t"
470     "packushb   $f12, $f12, $f14                          \n\t"
471     "packushb   $f14, $f0, $f2                            \n\t"
472 
473     "gslqc1     $f2, $f0, 432-32(%[tmp])                  \n\t"
474     "psubh      $f0, $f0, $f16                            \n\t"
475     "psubh      $f2, $f2, $f18                            \n\t"
476     "gslqc1     $f18, $f16, 432-80(%[tmp])                \n\t"
477     "psubh      $f16, $f16, $f20                          \n\t"
478     "gslqc1     $f26, $f24, 432-48(%[tmp])                \n\t"
479     "psubh      $f18, $f18, $f22                          \n\t"
480 
481     "gslqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
482     "paddh      $f20, $f20, $f24                          \n\t"
483     "paddh      $f22, $f22, $f26                          \n\t"
484     "gslqc1     $f26, $f24, 432-304(%[tmp])               \n\t"
485     "packushb   $f0, $f0, $f2                             \n\t"
486     "packushb   $f2, $f16, $f18                           \n\t"
487     "gslqc1     $f18, $f16, 432-384(%[tmp])               \n\t"
488     "paddh      $f16, $f16, $f24                          \n\t"
489     "paddh      $f18, $f18, $f26                          \n\t"
490     "gssqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
491     "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
492     "mov.d      $f28, $f0                                 \n\t"
493     "mov.d      $f30, $f2                                 \n\t"
494     "paddh      $f0, $f0, $f0                             \n\t"
495     "paddh      $f2, $f2, $f2                             \n\t"
496 
497     "dmtc1      %[iAlpha], $f24                           \n\t"
498     "dmtc1      %[iBeta], $f26                            \n\t"
499 
500     "psubh      $f16, $f16, $f0                           \n\t"
501     "psubh      $f18, $f18, $f2                           \n\t"
502     "dli        $11, 0x1                                  \n\t"
503     "gslqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
504     "gssqc1     $f10, $f8, 0x0($9)                        \n\t"
505     "dmtc1      $11, $f8                                  \n\t"
506     "psrah      $f16, $f16, $f8                           \n\t"
507     "psrah      $f18, $f18, $f8                           \n\t"
508     "pmaxsh     $f0, $f0, $f16                            \n\t"
509     "pmaxsh     $f2, $f2, $f18                            \n\t"
510     "pminsh     $f4, $f4, $f0                             \n\t"
511     "pminsh     $f6, $f6, $f2                             \n\t"
512     "gslqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
513 
514     "gslqc1     $f10, $f8, 428-256+4(%[tmp])              \n\t"
515     "and        $f4, $f4, $f24                            \n\t"
516     "and        $f6, $f6, $f26                            \n\t"
517     "and        $f4, $f4, $f8                             \n\t"
518     "and        $f6, $f6, $f10                            \n\t"
519     "gssqc1     $f14, $f12, 0x0($13)                      \n\t"
520     "paddh      $f28, $f28, $f4                           \n\t"
521     "paddh      $f30, $f30, $f6                           \n\t"
522     "packushb   $f20, $f20, $f22                          \n\t"
523     "packushb   $f22, $f28, $f30                          \n\t"
524     "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
525     "gssqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
526     : [pPix]"+&r"((unsigned char *)pPix)
527     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
528       [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
529     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
530       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
531       "$f22", "$f24", "$f26", "$f28", "$f30"
532   );
533   RECOVER_REG;
534 }
535 
/*!
 * \brief   Read 16 rows of 8 bytes each from pPixY, run them through
 *          MMI_TransTwo8x8B and write the result to pDst as 128 contiguous
 *          bytes (eight 16-byte stores). Judging by the function and macro
 *          names this is a byte transpose of the 16x8 strip; the exact output
 *          layout is defined by MMI_TransTwo8x8B, which is not visible here.
 *
 * \param   pPixY    address of the first source row; 8 bytes are read from
 *                   each of pPixY + k * iStride for k = 0..15. The rows are
 *                   fetched with gsldlc1/gsldrc1 pairs, so no particular
 *                   alignment of pPixY is relied upon.
 * \param   iStride  distance in bytes between consecutive source rows.
 * \param   pDst     destination buffer, written at offsets 0x00..0x7f with
 *                   gssqc1. NOTE(review): presumably must be 16-byte aligned
 *                   for the quad-word stores -- confirm against callers.
 */
DeblockLumaTransposeH2V_mmi(uint8_t * pPixY,int32_t iStride,uint8_t * pDst)536 void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
537                                  uint8_t *pDst) {
  /* BACKUP_REG / RECOVER_REG are macros defined outside this block;
   * presumably they save and restore registers around the asm -- confirm. */
538   BACKUP_REG;
539   __asm__ volatile(
    /* Make the assembler accept the Loongson-3A (gs* / MMI) mnemonics. */
540     ".set       arch=loongson3a                           \n\t"
    /* $8 = pPixY + 8 * iStride, i.e. the address of row 8. */
541     "dsll       $8, %[iStride], 0x3                       \n\t"
542     "daddu      $8, $8, %[pPixY]                          \n\t"
543
    /* $9 = row 1, $10 = row 9. */
544     "daddu      $9, %[pPixY], %[iStride]                  \n\t"
545     "daddu      $10, $8, %[iStride]                       \n\t"
    /* Rows 0, 8, 1, 9 -> $f0, $f2, $f4, $f6. Each gsldlc1 (offset 0x7) is
     * completed by the gsldrc1 (offset 0x0) on the same register; together
     * they load 8 bytes from an address that need not be aligned. */
546     "gsldlc1    $f0, 0x7(%[pPixY])                        \n\t"
547     "gsldlc1    $f2, 0x7($8)                              \n\t"
548     "gsldlc1    $f4, 0x7($9)                              \n\t"
549     "gsldlc1    $f6, 0x7($10)                             \n\t"
550     "gsldrc1    $f0, 0x0(%[pPixY])                        \n\t"
551     "gsldrc1    $f2, 0x0($8)                              \n\t"
552     "gsldrc1    $f4, 0x0($9)                              \n\t"
553     "gsldrc1    $f6, 0x0($10)                             \n\t"
    /* Step both halves down two rows:
     * pPixY = row 2, $8 = row 10, $9 = row 3, $10 = row 11. */
554     "daddu      %[pPixY], $9, %[iStride]                  \n\t"
555     "daddu      $8, $10, %[iStride]                       \n\t"
556     "daddu      $9, %[pPixY], %[iStride]                  \n\t"
557     "daddu      $10, $8, %[iStride]                       \n\t"
    /* Rows 2, 10, 3, 11 -> $f8, $f10, $f12, $f14. */
558     "gsldlc1    $f8, 0x7(%[pPixY])                        \n\t"
559     "gsldlc1    $f10, 0x7($8)                             \n\t"
560     "gsldlc1    $f12, 0x7($9)                             \n\t"
561     "gsldlc1    $f14, 0x7($10)                            \n\t"
562     "gsldrc1    $f8, 0x0(%[pPixY])                        \n\t"
563     "gsldrc1    $f10, 0x0($8)                             \n\t"
564     "gsldrc1    $f12, 0x0($9)                             \n\t"
565     "gsldrc1    $f14, 0x0($10)                            \n\t"
566
    /* pPixY = row 4, $8 = row 12, $9 = row 5, $10 = row 13. */
567     "daddu      %[pPixY], $9, %[iStride]                  \n\t"
568     "daddu      $8, $10, %[iStride]                       \n\t"
569     "daddu      $9, %[pPixY], %[iStride]                  \n\t"
570     "daddu      $10, $8, %[iStride]                       \n\t"
    /* Rows 4, 12, 5, 13 -> $f16, $f18, $f20, $f22. */
571     "gsldlc1    $f16, 0x7(%[pPixY])                       \n\t"
572     "gsldlc1    $f18, 0x7($8)                             \n\t"
573     "gsldlc1    $f20, 0x7($9)                             \n\t"
574     "gsldlc1    $f22, 0x7($10)                            \n\t"
575     "gsldrc1    $f16, 0x0(%[pPixY])                       \n\t"
576     "gsldrc1    $f18, 0x0($8)                             \n\t"
577     "gsldrc1    $f20, 0x0($9)                             \n\t"
578     "gsldrc1    $f22, 0x0($10)                            \n\t"
    /* pPixY = row 6, $8 = row 14, $9 = row 7, $10 = row 15. */
579     "daddu      %[pPixY], $9, %[iStride]                  \n\t"
580     "daddu      $8, $10, %[iStride]                       \n\t"
581     "daddu      $9, %[pPixY], %[iStride]                  \n\t"
582     "daddu      $10, $8, %[iStride]                       \n\t"
    /* Rows 6, 14, 7, 15 -> $f24, $f26, $f28, $f30. */
583     "gsldlc1    $f24, 0x7(%[pPixY])                       \n\t"
584     "gsldlc1    $f26, 0x7($8)                             \n\t"
585
586     "gsldlc1    $f28, 0x7($9)                             \n\t"
587     "gsldlc1    $f30, 0x7($10)                            \n\t"
588     "gsldrc1    $f24, 0x0(%[pPixY])                       \n\t"
589     "gsldrc1    $f26, 0x0($8)                             \n\t"
590     "gsldrc1    $f28, 0x0($9)                             \n\t"
591     "gsldrc1    $f30, 0x0($10)                            \n\t"
592
    /* All 16 rows are now in $f0..$f30. MMI_TransTwo8x8B is defined outside
     * this block; by its name it transposes two 8x8 byte blocks in place.
     * $9 and $10 are not used as row pointers after this point and are
     * handed to the macro, presumably as scratch registers -- confirm. */
593     MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
594                      $f14, $f16, $f18, $f20, $f22, $f24,
595                      $f26, $f28, $f30, $9, $10)
596
    /* Write the register pairs as eight 16-byte quads at pDst + 0x00..0x70.
     * The pair order (16/18, 8/10, 12/14, 28/30, 20/22, 4/6, 24/26, 0/2)
     * follows where the macro leaves its results. */
597     "gssqc1     $f18, $f16, 0x0(%[pDst])                  \n\t"
598     "gssqc1     $f10, $f8, 0x10(%[pDst])                  \n\t"
599     "gssqc1     $f14, $f12, 0x20(%[pDst])                 \n\t"
600     "gssqc1     $f30, $f28, 0x30(%[pDst])                 \n\t"
601     "gssqc1     $f22, $f20, 0x40(%[pDst])                 \n\t"
602     "gssqc1     $f6, $f4, 0x50(%[pDst])                   \n\t"
603     "gssqc1     $f26, $f24, 0x60(%[pDst])                 \n\t"
604     "gssqc1     $f2, $f0, 0x70(%[pDst])                   \n\t"
    /* pPixY is a read-write, early-clobber operand because the asm advances
     * it while walking the rows; only the local copy changes. */
605     : [pPixY] "+&r"((unsigned char *)pPixY)
606     : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
    /* Clobbers: the three GPR temporaries, every FP register touched above,
     * and "memory" because the stores go through pDst. */
607     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
608       "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
609       "$f30"
610   );
611   RECOVER_REG;
612 }
613 
/*!
 * \brief   Load 128 contiguous bytes from pSrc, run them through
 *          MMI_TransTwo8x8B and write the result to pPixY as 16 rows of
 *          8 bytes each, iStride bytes apart. Judging by the function and
 *          macro names this is a byte transpose back into picture layout;
 *          the exact mapping is defined by MMI_TransTwo8x8B, which is not
 *          visible here.
 *
 * \param   pPixY    address of the first destination row; 8 bytes are written
 *                   to each of pPixY + k * iStride for k = 0..15. The rows
 *                   are stored with gssdlc1/gssdrc1 pairs, so no particular
 *                   alignment of pPixY is relied upon.
 * \param   iStride  distance in bytes between consecutive destination rows.
 * \param   pSrc     source buffer, read at offsets 0x00..0x7f with gslqc1.
 *                   NOTE(review): presumably must be 16-byte aligned for the
 *                   quad-word loads -- confirm against callers.
 */
DeblockLumaTransposeV2H_mmi(uint8_t * pPixY,int32_t iStride,uint8_t * pSrc)614 void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
615                                  uint8_t *pSrc) {
  /* BACKUP_REG / RECOVER_REG are macros defined outside this block;
   * presumably they save and restore registers around the asm -- confirm. */
616   BACKUP_REG;
617   __asm__ volatile(
    /* Make the assembler accept the Loongson-3A (gs* / MMI) mnemonics. */
618     ".set       arch=loongson3a                           \n\t"
    /* Fetch the eight 16-byte quads at pSrc + 0x00..0x70; each gslqc1 fills
     * one pair of FP registers, so $f0..$f30 hold all 128 bytes. */
619     "gslqc1     $f2, $f0, 0x0(%[pSrc])                    \n\t"
620     "gslqc1     $f6, $f4, 0x10(%[pSrc])                   \n\t"
621     "gslqc1     $f10, $f8, 0x20(%[pSrc])                  \n\t"
622     "gslqc1     $f14, $f12, 0x30(%[pSrc])                 \n\t"
623     "gslqc1     $f18, $f16, 0x40(%[pSrc])                 \n\t"
624     "gslqc1     $f22, $f20, 0x50(%[pSrc])                 \n\t"
625     "gslqc1     $f26, $f24, 0x60(%[pSrc])                 \n\t"
626     "gslqc1     $f30, $f28, 0x70(%[pSrc])                 \n\t"
627
    /* MMI_TransTwo8x8B is defined outside this block; by its name it
     * transposes two 8x8 byte blocks in place. $9 and $10 are not otherwise
     * used in this function and are handed to the macro, presumably as
     * scratch registers -- confirm. */
628     MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
629                      $f14, $f16, $f18, $f20, $f22, $f24,
630                      $f26, $f28, $f30, $9, $10)
631
    /* From here on pPixY addresses the even row and $8 the odd row of each
     * pair. Each gssdlc1 (offset 0x7) is completed by the gssdrc1 (offset
     * 0x0) on the same register; together they store 8 bytes to an address
     * that need not be aligned.
     * Rows 0, 1 <- $f16, $f8. */
632     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
633     "gssdlc1    $f16, 0x7(%[pPixY])                       \n\t"
634     "gssdlc1    $f8, 0x7($8)                              \n\t"
635     "gssdrc1    $f16, 0x0(%[pPixY])                       \n\t"
636     "gssdrc1    $f8, 0x0($8)                              \n\t"
    /* Rows 2, 3 <- $f12, $f28. */
637     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
638     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
639     "gssdlc1    $f12, 0x7(%[pPixY])                       \n\t"
640     "gssdlc1    $f28, 0x7($8)                             \n\t"
641     "gssdrc1    $f12, 0x0(%[pPixY])                       \n\t"
642     "gssdrc1    $f28, 0x0($8)                             \n\t"
643
    /* Rows 4, 5 <- $f20, $f4. */
644     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
645     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
646     "gssdlc1    $f20, 0x7(%[pPixY])                       \n\t"
647     "gssdlc1    $f4, 0x7($8)                              \n\t"
648     "gssdrc1    $f20, 0x0(%[pPixY])                       \n\t"
649     "gssdrc1    $f4, 0x0($8)                              \n\t"
    /* Rows 6, 7 <- $f24, $f0. */
650     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
651     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
652     "gssdlc1    $f24, 0x7(%[pPixY])                       \n\t"
653     "gssdlc1    $f0, 0x7($8)                              \n\t"
654     "gssdrc1    $f24, 0x0(%[pPixY])                       \n\t"
655     "gssdrc1    $f0, 0x0($8)                              \n\t"
656
    /* Rows 8, 9 <- $f18, $f10. */
657     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
658     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
659     "gssdlc1    $f18, 0x7(%[pPixY])                       \n\t"
660     "gssdlc1    $f10, 0x7($8)                             \n\t"
661     "gssdrc1    $f18, 0x0(%[pPixY])                       \n\t"
662     "gssdrc1    $f10, 0x0($8)                             \n\t"
    /* Rows 10, 11 <- $f14, $f30. */
663     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
664     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
665     "gssdlc1    $f14, 0x7(%[pPixY])                       \n\t"
666     "gssdlc1    $f30, 0x7($8)                             \n\t"
667     "gssdrc1    $f14, 0x0(%[pPixY])                       \n\t"
668     "gssdrc1    $f30, 0x0($8)                             \n\t"
669
    /* Rows 12, 13 <- $f22, $f6. */
670     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
671     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
672     "gssdlc1    $f22, 0x7(%[pPixY])                       \n\t"
673     "gssdlc1    $f6, 0x7($8)                              \n\t"
674     "gssdrc1    $f22, 0x0(%[pPixY])                       \n\t"
675     "gssdrc1    $f6, 0x0($8)                              \n\t"
    /* Rows 14, 15 <- $f26, $f2. */
676     "daddu      %[pPixY], $8, %[iStride]                  \n\t"
677     "daddu      $8, %[pPixY], %[iStride]                  \n\t"
678     "gssdlc1    $f26, 0x7(%[pPixY])                       \n\t"
679     "gssdlc1    $f2, 0x7($8)                              \n\t"
680     "gssdrc1    $f26, 0x0(%[pPixY])                       \n\t"
681     "gssdrc1    $f2, 0x0($8)                              \n\t"
    /* pPixY is a read-write, early-clobber operand because the asm advances
     * it while walking the rows; only the local copy changes. */
682     : [pPixY] "+&r"((unsigned char *)pPixY)
683     : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
    /* Clobbers: $8 (odd-row pointer), $9/$10 (passed to MMI_TransTwo8x8B),
     * every FP register touched above, and "memory" because the stores go
     * through pPixY. */
684     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
685       "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
686       "$f30"
687   );
688   RECOVER_REG;
689 }
690 
DeblockLumaEq4V_mmi(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)691 void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
692                          int32_t iBeta) {
693   unsigned char tmp[720] __attribute__((aligned(32)));
694   BACKUP_REG;
695   __asm__ volatile (
696     ".set       arch=loongson3a                           \n\t"
697     "dsll       $11, %[iStride], 0x2                      \n\t"
698     "xor        $f8, $f8, $f8                             \n\t"
699     "daddu      $14, %[iStride], %[pPix]                  \n\t"
700     "dsubu      $8, %[pPix], $11                          \n\t"
701     "gslqc1     $f14, $f12, 0x0($8)                       \n\t"
702     "gslqc1     $f22, $f20, 0x0(%[pPix])                  \n\t"
703     "daddu      $9, %[iStride], %[iStride]                \n\t"
704     "daddu      $10, $9, %[iStride]                       \n\t"
705     "move       $12, $9                                   \n\t"
706     "dsubu      $8, %[pPix], $9                           \n\t"
707     "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
708     "dsubu      $9, %[pPix], %[iStride]                   \n\t"
709     "gslqc1     $f18, $f16, 0x0($9)                       \n\t"
710     "daddu      $13, %[iStride], %[pPix]                  \n\t"
711 
712     "move       %[iStride], $12                           \n\t"
713     "daddu      $15, $12, %[pPix]                         \n\t"
714 
715     "daddu      $12, %[pPix], $10                         \n\t"
716     "dsubu      $11, %[pPix], $10                         \n\t"
717 
718     "gslqc1     $f26, $f24, 0x0($11)                      \n\t"
719     "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
720     "dmtc1      %[iAlpha], $f0                            \n\t"
721 
722     "punpcklhw  $f28, $f0, $f0                            \n\t"
723     "punpcklwd  $f0, $f28, $f28                           \n\t"
724     "mov.d      $f2, $f0                                  \n\t"
725     "gssqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
726     "dmtc1      %[iBeta], $f0                             \n\t"
727     "gsldxc1    $f10, 0x0($15, $0)                        \n\t"
728     "punpcklhw  $f28, $f0, $f0                            \n\t"
729     "punpcklwd  $f0, $f28, $f28                           \n\t"
730     "punpckhbh  $f30, $f10, $f8                           \n\t"
731     "mov.d      $f2, $f0                                  \n\t"
732 
733     "punpcklbh  $f28, $f10, $f8                           \n\t"
734     "gssqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
735     "gssqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
736     "mov.d      $f0, $f4                                  \n\t"
737     "gssqc1     $f22, $f20, 704-272(%[tmp])               \n\t"
738     "gssqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
739     "mov.d      $f4, $f16                                 \n\t"
740     "punpckhbh  $f22, $f20, $f8                           \n\t"
741     "punpcklbh  $f20, $f20, $f8                           \n\t"
742     "punpckhbh  $f6, $f4, $f8                             \n\t"
743     "punpcklbh  $f4, $f4, $f8                             \n\t"
744 
745     "psubh      $f28, $f20, $f4                           \n\t"
746     "psubh      $f30, $f22, $f6                           \n\t"
747     WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
748     "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
749     "punpckhbh  $f2, $f0, $f8                             \n\t"
750     "punpcklbh  $f0, $f0, $f8                             \n\t"
751     "gssqc1     $f18, $f16, 688-272(%[tmp])               \n\t"
752     "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
753     "gssqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
754 
755     "psubh      $f28, $f4, $f0                            \n\t"
756     "psubh      $f30, $f6, $f2                            \n\t"
757 
758     "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
759     WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
760     "punpckhbh  $f18, $f16, $f8                           \n\t"
761     "punpcklbh  $f16, $f16, $f8                           \n\t"
762     "pcmpgth    $f0, $f0, $f28                            \n\t"
763     "pcmpgth    $f2, $f2, $f30                            \n\t"
764     "gssqc1     $f18, $f16, 640-384(%[tmp])               \n\t"
765     "psubh      $f28, $f20, $f16                          \n\t"
766     "psubh      $f30, $f22, $f18                          \n\t"
767     "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
768     "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
769     "punpckhbh  $f26, $f24, $f8                           \n\t"
770     "punpcklbh  $f24, $f24, $f8                           \n\t"
771     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
772     "gssqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
773     "gssqc1     $f6, $f4, 640-144(%[tmp])                 \n\t"
774     "gssqc1     $f22, $f20, 640-400(%[tmp])               \n\t"
775     "pcmpgth    $f16, $f16, $f28                          \n\t"
776     "pcmpgth    $f18, $f18, $f30                          \n\t"
777     "and        $f0, $f0, $f16                            \n\t"
778     "and        $f2, $f2, $f18                            \n\t"
779     "gslqc1     $f18, $f16, 640-320(%[tmp])               \n\t"
780     "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
781     "dli        %[iAlpha], 0x2                            \n\t"
782     "dli        %[iBeta], 0x2                             \n\t"
783     "pcmpgth    $f16, $f16, $f28                          \n\t"
784     "pcmpgth    $f18, $f18, $f30                          \n\t"
785     "and        $f0, $f0, $f16                            \n\t"
786     "and        $f2, $f2, $f18                            \n\t"
787     "dmtc1      %[iAlpha], $f16                           \n\t"
788     "dmtc1      %[iBeta], $f10                            \n\t"
789     "gssqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
790     "gslqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
791 
792     "punpcklhw  $f28, $f16, $f16                          \n\t"
793     "psrah      $f16, $f0, $f10                           \n\t"
794     "psrah      $f18, $f2, $f10                           \n\t"
795     "punpcklwd  $f28, $f28, $f28                          \n\t"
796     "mov.d      $f30, $f28                                \n\t"
797     "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
798     "paddh      $f16, $f16, $f28                          \n\t"
799     "paddh      $f18, $f18, $f30                          \n\t"
800     "gssqc1     $f18, $f16, 640-576(%[tmp])               \n\t"
801     "pcmpgth    $f16, $f16, $f8                           \n\t"
802     "pcmpgth    $f18, $f18, $f10                          \n\t"
803     "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
804 
805     "gssqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
806     "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
807     "psubh      $f28, $f4, $f24                           \n\t"
808     "psubh      $f30, $f6, $f26                           \n\t"
809     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
810     "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
811     "pcmpgth    $f16, $f16, $f28                          \n\t"
812     "pcmpgth    $f18, $f18, $f30                          \n\t"
813 
814     "gslqc1     $f2, $f0, 640-416(%[tmp])                 \n\t"
815     "and        $f16, $f16, $f8                           \n\t"
816     "and        $f18, $f18, $f10                          \n\t"
817     "gssqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
818     "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
819     "psubh      $f28, $f20, $f0                           \n\t"
820     "psubh      $f30, $f22, $f2                           \n\t"
821     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
822     "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
823     "pcmpgth    $f16, $f16, $f28                          \n\t"
824     "pcmpgth    $f18, $f18, $f30                          \n\t"
825 
826     "and        $f16, $f16, $f8                           \n\t"
827     "and        $f18, $f18, $f10                          \n\t"
828     "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
829 
830     "gslqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
831     "xor        $f8, $f8, $f8                             \n\t"
832     "pandn      $f16, $f16, $f24                          \n\t"
833     "dli        %[iAlpha], 0x4                            \n\t"
834     "pandn      $f18, $f18, $f26                          \n\t"
835     "gssqc1     $f18, $f16, 640-16(%[tmp])                \n\t"
836     "dmtc1      %[iAlpha], $f16                           \n\t"
837     "punpcklhw  $f28, $f16, $f16                          \n\t"
838     "dli        %[iAlpha], 0x1                            \n\t"
839     "punpckhbh  $f18, $f12, $f8                           \n\t"
840     "dmtc1      %[iAlpha], $f30                           \n\t"
841     "punpcklbh  $f16, $f12, $f8                           \n\t"
842     "psllh      $f16, $f16, $f30                          \n\t"
843     "psllh      $f18, $f18, $f30                          \n\t"
844     "paddh      $f16, $f16, $f24                          \n\t"
845     "paddh      $f18, $f18, $f26                          \n\t"
846     "gslqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
847     "paddh      $f16, $f16, $f24                          \n\t"
848     "paddh      $f18, $f18, $f26                          \n\t"
849     "paddh      $f16, $f16, $f24                          \n\t"
850     "paddh      $f18, $f18, $f26                          \n\t"
851     "paddh      $f16, $f16, $f0                           \n\t"
852     "paddh      $f18, $f18, $f2                           \n\t"
853 
854     "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
855     "punpcklwd  $f28, $f28, $f28                          \n\t"
856     "mov.d      $f30, $f28                                \n\t"
857     "paddh      $f16, $f16, $f4                           \n\t"
858     "paddh      $f18, $f18, $f6                           \n\t"
859     "gssqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
860     "paddh      $f16, $f16, $f20                          \n\t"
861     "paddh      $f18, $f18, $f22                          \n\t"
862     "paddh      $f16, $f16, $f28                          \n\t"
863     "paddh      $f18, $f18, $f30                          \n\t"
864     "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
865     "gslqc1     $f2, $f0, 640-384(%[tmp])                 \n\t"
866     "pandn      $f24, $f24, $f28                          \n\t"
867     "pandn      $f26, $f26, $f30                          \n\t"
868     "gssqc1     $f26, $f24, 640-80(%[tmp])                \n\t"
869     "gslqc1     $f26, $f24, 0x0($12)                      \n\t"
870     "dmtc1      %[iAlpha], $f10                           \n\t"
871     "punpckhbh  $f26, $f24, $f8                           \n\t"
872     "punpcklbh  $f24, $f24, $f8                           \n\t"
873     "psllh      $f24, $f24, $f10                          \n\t"
874     "psllh      $f26, $f26, $f10                          \n\t"
875     "paddh      $f24, $f24, $f28                          \n\t"
876     "paddh      $f26, $f26, $f30                          \n\t"
877     "paddh      $f24, $f24, $f28                          \n\t"
878     "paddh      $f26, $f26, $f30                          \n\t"
879     "paddh      $f24, $f24, $f28                          \n\t"
880     "paddh      $f26, $f26, $f30                          \n\t"
881     "paddh      $f24, $f24, $f0                           \n\t"
882     "paddh      $f26, $f26, $f2                           \n\t"
883 
884     "dli        %[iAlpha], 0x3                            \n\t"
885     "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
886     "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
887     "paddh      $f24, $f24, $f20                          \n\t"
888     "paddh      $f26, $f26, $f22                          \n\t"
889     "paddh      $f24, $f24, $f4                           \n\t"
890     "paddh      $f26, $f26, $f6                           \n\t"
891     "paddh      $f24, $f24, $f0                           \n\t"
892     "paddh      $f26, $f26, $f2                           \n\t"
893     "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
894     "dmtc1      %[iAlpha], $f10                           \n\t"
895     "psrah      $f24, $f24, $f10                          \n\t"
896     "psrah      $f26, $f26, $f10                          \n\t"
897     "and        $f24, $f24, $f0                           \n\t"
898     "and        $f26, $f26, $f2                           \n\t"
899     "gssqc1     $f26, $f24, 640-112(%[tmp])               \n\t"
900     "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
901     "pandn      $f24, $f24, $f28                          \n\t"
902     "pandn      $f26, $f26, $f30                          \n\t"
903     "gssqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
904     "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
905     "gssqc1     $f26, $f24, 640-528(%[tmp])               \n\t"
906     "gslqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
907     "gslqc1     $f2, $f0, 640-544(%[tmp])                 \n\t"
908     "dmtc1      %[iAlpha], $f10                           \n\t"
909     "paddh      $f24, $f24, $f28                          \n\t"
910     "paddh      $f26, $f26, $f30                          \n\t"
911     "psrah      $f16, $f16, $f10                          \n\t"
912     "psrah      $f18, $f18, $f10                          \n\t"
913     "and        $f16, $f16, $f0                           \n\t"
914     "and        $f18, $f18, $f2                           \n\t"
915     "gslqc1     $f2, $f0, 640-624(%[tmp])                 \n\t"
916     "paddh      $f28, $f4, $f20                           \n\t"
917     "paddh      $f30, $f6, $f22                           \n\t"
918     "paddh      $f24, $f24, $f28                          \n\t"
919     "paddh      $f26, $f26, $f30                          \n\t"
920     "paddh      $f24, $f24, $f0                           \n\t"
921     "paddh      $f26, $f26, $f2                           \n\t"
922     "gslqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
923     "dli        %[iAlpha], 0x2                            \n\t"
924 
925     "dmtc1      %[iAlpha], $f10                           \n\t"
926     "paddh      $f20, $f20, $f4                           \n\t"
927     "paddh      $f22, $f22, $f6                           \n\t"
928     "psrah      $f24, $f24, $f10                          \n\t"
929     "psrah      $f26, $f26, $f10                          \n\t"
930     "and        $f28, $f28, $f24                          \n\t"
931     "and        $f30, $f30, $f26                          \n\t"
932 
933     "gslqc1     $f26, $f24, 640-384(%[tmp])               \n\t"
934     "gssqc1     $f30, $f28, 640-64(%[tmp])                \n\t"
935     "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
936     "pandn      $f28, $f28, $f24                          \n\t"
937     "pandn      $f30, $f30, $f26                          \n\t"
938     "gssqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
939     "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
940     "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
941     "paddh      $f28, $f28, $f24                          \n\t"
942     "paddh      $f30, $f30, $f26                          \n\t"
943     "paddh      $f28, $f28, $f20                          \n\t"
944     "paddh      $f30, $f30, $f22                          \n\t"
945     "paddh      $f28, $f28, $f8                           \n\t"
946     "paddh      $f30, $f30, $f10                          \n\t"
947     "dmtc1      %[iAlpha], $f10                           \n\t"
948     "gslqc1     $f22, $f20, 640-560(%[tmp])               \n\t"
949     "psrah      $f28, $f28, $f10                          \n\t"
950     "psrah      $f30, $f30, $f10                          \n\t"
951     "and        $f20, $f20, $f28                          \n\t"
952     "and        $f22, $f22, $f30                          \n\t"
953     "gssqc1     $f22, $f20, 640-32(%[tmp])                \n\t"
954 
955     "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
956     "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
957     "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
958     "paddh      $f28, $f20, $f20                          \n\t"
959     "paddh      $f30, $f22, $f22                          \n\t"
960     "paddh      $f20, $f4, $f24                           \n\t"
961     "paddh      $f22, $f6, $f26                           \n\t"
962     "paddh      $f24, $f24, $f0                           \n\t"
963     "paddh      $f26, $f26, $f2                           \n\t"
964     "paddh      $f28, $f28, $f20                          \n\t"
965     "paddh      $f30, $f30, $f22                          \n\t"
966     "paddh      $f28, $f28, $f8                           \n\t"
967     "paddh      $f30, $f30, $f10                          \n\t"
968     "dmtc1      %[iAlpha], $f10                           \n\t"
969     "gslqc1     $f22, $f20, 640-544(%[tmp])               \n\t"
970     "psrah      $f28, $f28, $f10                          \n\t"
971     "psrah      $f30, $f30, $f10                          \n\t"
972     "dli        %[iAlpha], 0x1                            \n\t"
973     "pandn      $f20, $f20, $f28                          \n\t"
974     "pandn      $f22, $f22, $f30                          \n\t"
975     "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
976     "paddh      $f28, $f28, $f4                           \n\t"
977     "paddh      $f30, $f30, $f6                           \n\t"
978     "gslqc1     $f6, $f4, 640-400(%[tmp])                 \n\t"
979     "paddh      $f28, $f28, $f4                           \n\t"
980     "paddh      $f30, $f30, $f6                           \n\t"
981     "gslqc1     $f6, $f4, 640-544(%[tmp])                 \n\t"
982     "dmtc1      %[iAlpha], $f10                           \n\t"
983     "gssqc1     $f22, $f20, 640-352(%[tmp])               \n\t"
984     "gslqc1     $f22, $f20, 640-368(%[tmp])               \n\t"
985     "psllh      $f28, $f28, $f10                          \n\t"
986     "psllh      $f30, $f30, $f10                          \n\t"
987     "dli        %[iAlpha], 0x3                            \n\t"
988     "paddh      $f28, $f28, $f24                          \n\t"
989     "paddh      $f30, $f30, $f26                          \n\t"
990     "paddh      $f20, $f20, $f28                          \n\t"
991     "paddh      $f22, $f22, $f30                          \n\t"
992     "dmtc1      %[iAlpha], $f10                           \n\t"
993 
994     "dli        %[iAlpha], 0x2                            \n\t"
995     "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
996     "psrah      $f20, $f20, $f10                          \n\t"
997     "psrah      $f22, $f22, $f10                          \n\t"
998     "and        $f4, $f4, $f20                            \n\t"
999     "and        $f6, $f6, $f22                            \n\t"
1000     "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
1001     "gssqc1     $f6, $f4, 640-96(%[tmp])                  \n\t"
1002     "gslqc1     $f6, $f4, 640-384(%[tmp])                 \n\t"
1003     "gslqc1     $f10, $f8, 640-400(%[tmp])                \n\t"
1004     "paddh      $f24, $f4, $f4                            \n\t"
1005     "paddh      $f26, $f6, $f6                            \n\t"
1006     "paddh      $f4, $f4, $f8                             \n\t"
1007     "paddh      $f6, $f6, $f10                            \n\t"
1008     "gslqc1     $f10, $f8, 640-144(%[tmp])                \n\t"
1009     "paddh      $f28, $f28, $f20                          \n\t"
1010     "paddh      $f30, $f30, $f22                          \n\t"
1011     "paddh      $f4, $f4, $f8                             \n\t"
1012     "paddh      $f6, $f6, $f10                            \n\t"
1013     "gslqc1     $f10, $f8, 640-592(%[tmp])                \n\t"
1014     "paddh      $f24, $f24, $f28                          \n\t"
1015     "paddh      $f26, $f26, $f30                          \n\t"
1016     "paddh      $f20, $f20, $f8                           \n\t"
1017     "paddh      $f22, $f22, $f10                          \n\t"
1018     "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
1019     "paddh      $f24, $f24, $f8                           \n\t"
1020     "dmtc1      %[iAlpha], $f8                            \n\t"
1021     "paddh      $f26, $f26, $f10                          \n\t"
1022     "dli        %[iAlpha], 0x1                            \n\t"
1023     "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
1024     "dmtc1      %[iAlpha], $f10                           \n\t"
1025     "psrah      $f24, $f24, $f8                           \n\t"
1026     "psrah      $f26, $f26, $f8                           \n\t"
1027     "psllh      $f4, $f4, $f10                            \n\t"
1028     "psllh      $f6, $f6, $f10                            \n\t"
1029     "paddh      $f4, $f4, $f20                            \n\t"
1030     "paddh      $f6, $f6, $f22                            \n\t"
1031     "dli        %[iAlpha], 0x3                            \n\t"
1032 
1033     "gslqc1     $f22, $f20, 656-272(%[tmp])               \n\t"
1034     "pandn      $f28, $f28, $f24                          \n\t"
1035     "pandn      $f30, $f30, $f26                          \n\t"
1036     "gslqc1     $f26, $f24, 640-416(%[tmp])               \n\t"
1037     "dmtc1      %[iAlpha], $f10                           \n\t"
1038     "paddh      $f24, $f24, $f4                           \n\t"
1039     "paddh      $f26, $f26, $f6                           \n\t"
1040     "gslqc1     $f6, $f4, 640-560(%[tmp])                 \n\t"
1041     "psrah      $f24, $f24, $f10                          \n\t"
1042     "psrah      $f26, $f26, $f10                          \n\t"
1043     "and        $f4, $f4, $f24                            \n\t"
1044     "and        $f6, $f6, $f26                            \n\t"
1045 
1046     "xor        $f8, $f8, $f8                             \n\t"
1047     "gslqc1     $f26, $f24, 704-272(%[tmp])               \n\t"
1048     "gssqc1     $f6, $f4, 640-128(%[tmp])                 \n\t"
1049     "gslqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
1050     "punpcklbh  $f4, $f6, $f8                             \n\t"
1051     "punpckhbh  $f6, $f6, $f8                             \n\t"
1052     "gssqc1     $f6, $f4, 640-448(%[tmp])                 \n\t"
1053     "gslqc1     $f6, $f4, 688-272(%[tmp])                 \n\t"
1054     "punpcklbh  $f4, $f6, $f8                             \n\t"
1055     "punpckhbh  $f6, $f6, $f8                             \n\t"
1056     "punpcklbh  $f24, $f26, $f8                           \n\t"
1057     "punpckhbh  $f26, $f26, $f8                           \n\t"
1058     "gssqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
1059     "punpcklbh  $f20, $f22, $f8                           \n\t"
1060     "punpckhbh  $f22, $f22, $f8                           \n\t"
1061     "gslqc1     $f30, $f28, 0x0($14)                      \n\t"
1062     "gssqc1     $f6, $f4, 640-496(%[tmp])                 \n\t"
1063     "gssqc1     $f26, $f24, 640-432(%[tmp])               \n\t"
1064 
1065     "gsldxc1    $f0, 0x8($15, $0)                         \n\t"
1066     "punpcklbh  $f28, $f30, $f8                           \n\t"
1067     "punpckhbh  $f30, $f30, $f8                           \n\t"
1068     "gssqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
1069 
1070     "punpcklbh  $f28, $f0, $f8                            \n\t"
1071     "punpckhbh  $f30, $f0, $f8                            \n\t"
1072     "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
1073     "gssqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
1074 
1075     "psubh      $f28, $f24, $f4                           \n\t"
1076     "psubh      $f30, $f26, $f6                           \n\t"
1077     "psubh      $f24, $f24, $f8                           \n\t"
1078     "psubh      $f26, $f26, $f10                          \n\t"
1079     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1080     "gslqc1     $f10, $f8, 640-16(%[tmp])                 \n\t"
1081     "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
1082     "or         $f16, $f16, $f8                           \n\t"
1083     "or         $f18, $f18, $f10                          \n\t"
1084     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
1085     "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
1086     "psubh      $f28, $f4, $f28                           \n\t"
1087     "psubh      $f30, $f6, $f30                           \n\t"
1088 
1089     "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
1090     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1091     "pcmpgth    $f4, $f0, $f28                            \n\t"
1092     "pcmpgth    $f6, $f2, $f30                            \n\t"
1093     "pcmpgth    $f28, $f0, $f24                           \n\t"
1094     "pcmpgth    $f30, $f2, $f26                           \n\t"
1095     "gslqc1     $f26, $f24, 640-320(%[tmp])               \n\t"
1096     "and        $f4, $f4, $f28                            \n\t"
1097     "and        $f6, $f6, $f30                            \n\t"
1098     "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
1099     "pcmpgth    $f24, $f24, $f28                          \n\t"
1100     "pcmpgth    $f26, $f26, $f30                          \n\t"
1101     "and        $f4, $f4, $f24                            \n\t"
1102     "and        $f6, $f6, $f26                            \n\t"
1103 
1104     "gslqc1     $f26, $f24, 640-576(%[tmp])               \n\t"
1105     "pcmpgth    $f24, $f24, $f28                          \n\t"
1106     "pcmpgth    $f26, $f26, $f30                          \n\t"
1107     "xor        $f8, $f8, $f8                             \n\t"
1108     "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
1109     "punpcklbh  $f12, $f14, $f8                           \n\t"
1110     "punpckhbh  $f14, $f14, $f8                           \n\t"
1111     "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
1112     "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
1113     "psubh      $f28, $f28, $f20                          \n\t"
1114     "psubh      $f30, $f30, $f22                          \n\t"
1115     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1116     "pcmpgth    $f24, $f24, $f28                          \n\t"
1117     "pcmpgth    $f26, $f26, $f30                          \n\t"
1118 
1119     "dli        %[iAlpha], 0x1                            \n\t"
1120     "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
1121     "and        $f24, $f24, $f8                           \n\t"
1122     "and        $f26, $f26, $f10                          \n\t"
1123     "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
1124     "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
1125     "psubh      $f28, $f28, $f8                           \n\t"
1126     "psubh      $f30, $f30, $f10                          \n\t"
1127     "dmtc1      %[iAlpha], $f10                           \n\t"
1128 
1129     "psllh      $f12, $f12, $f10                          \n\t"
1130     "psllh      $f14, $f14, $f10                          \n\t"
1131     "gssqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
1132     "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
1133 
1134     "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
1135     "paddh      $f12, $f12, $f20                          \n\t"
1136     "paddh      $f14, $f14, $f22                          \n\t"
1137     "paddh      $f12, $f12, $f20                          \n\t"
1138     "paddh      $f14, $f14, $f22                          \n\t"
1139     "paddh      $f12, $f12, $f20                          \n\t"
1140     "paddh      $f14, $f14, $f22                          \n\t"
1141     "paddh      $f12, $f12, $f8                           \n\t"
1142     "paddh      $f14, $f14, $f10                          \n\t"
1143     "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
1144     "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
1145     "paddh      $f12, $f12, $f8                           \n\t"
1146     "paddh      $f14, $f14, $f10                          \n\t"
1147     WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
1148     "pcmpgth    $f24, $f24, $f28                          \n\t"
1149     "pcmpgth    $f26, $f26, $f30                          \n\t"
1150     "and        $f24, $f24, $f0                           \n\t"
1151     "and        $f26, $f26, $f2                           \n\t"
1152     "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
1153     "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
1154 
1155     "gslqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
1156     "dli        %[iAlpha], 0x3                            \n\t"
1157     "gslqc1     $f30, $f28, 640-368(%[tmp])               \n\t"
1158     "and        $f24, $f0, $f16                           \n\t"
1159     "and        $f26, $f2, $f18                           \n\t"
1160     "pandn      $f16, $f0, $f28                           \n\t"
1161     "pandn      $f18, $f2, $f30                           \n\t"
1162     "or         $f24, $f24, $f16                          \n\t"
1163     "or         $f26, $f26, $f18                          \n\t"
1164     "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
1165     "paddh      $f12, $f12, $f16                          \n\t"
1166     "paddh      $f14, $f14, $f18                          \n\t"
1167     "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
1168     "paddh      $f12, $f12, $f28                          \n\t"
1169     "paddh      $f14, $f14, $f30                          \n\t"
1170     "dmtc1      %[iAlpha], $f28                           \n\t"
1171     "psrah      $f12, $f12, $f28                          \n\t"
1172     "psrah      $f14, $f14, $f28                          \n\t"
1173     "and        $f12, $f12, $f8                           \n\t"
1174     "and        $f14, $f14, $f10                          \n\t"
1175     "pandn      $f8, $f8, $f20                            \n\t"
1176     "pandn      $f10, $f10, $f22                          \n\t"
1177     "or         $f12, $f12, $f8                           \n\t"
1178     "or         $f14, $f14, $f10                          \n\t"
1179     "and        $f28, $f4, $f12                           \n\t"
1180     "and        $f30, $f6, $f14                           \n\t"
1181     "gslqc1     $f14, $f12, 640-64(%[tmp])                \n\t"
1182     "gslqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
1183     "or         $f12, $f12, $f8                           \n\t"
1184     "or         $f14, $f14, $f10                          \n\t"
1185     "pandn      $f8, $f4, $f20                            \n\t"
1186     "pandn      $f10, $f6, $f22                           \n\t"
1187     "or         $f28, $f28, $f8                           \n\t"
1188     "or         $f30, $f30, $f10                          \n\t"
1189 
1190     "dli        %[iAlpha], 0x2                            \n\t"
1191     "and        $f8, $f0, $f12                            \n\t"
1192     "and        $f10, $f2, $f14                           \n\t"
1193     "gslqc1     $f14, $f12, 640-480(%[tmp])               \n\t"
1194     "pandn      $f12, $f0, $f12                           \n\t"
1195     "pandn      $f14, $f2, $f14                           \n\t"
1196     "or         $f8, $f8, $f12                            \n\t"
1197     "or         $f10, $f10, $f14                          \n\t"
1198     "packushb   $f24, $f24, $f26                          \n\t"
1199     "packushb   $f26, $f28, $f30                          \n\t"
1200     "gssqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
1201     "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
1202     "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
1203     "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
1204     "paddh      $f8, $f20, $f8                            \n\t"
1205     "paddh      $f10, $f22, $f10                          \n\t"
1206     "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
1207     "paddh      $f28, $f28, $f16                          \n\t"
1208     "paddh      $f30, $f30, $f18                          \n\t"
1209     "paddh      $f8, $f8, $f28                            \n\t"
1210     "paddh      $f10, $f10, $f30                          \n\t"
1211     "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
1212     "paddh      $f8, $f8, $f28                            \n\t"
1213     "paddh      $f10, $f10, $f30                          \n\t"
1214     "dmtc1      %[iAlpha], $f28                           \n\t"
1215     "psrah      $f8, $f8, $f28                            \n\t"
1216     "psrah      $f10, $f10, $f28                          \n\t"
1217     "dli        %[iAlpha], 0x1                            \n\t"
1218     "gslqc1     $f30, $f28, 640-544(%[tmp])               \n\t"
1219     "and        $f24, $f24, $f8                           \n\t"
1220     "and        $f26, $f26, $f10                          \n\t"
1221     "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
1222     "pandn      $f28, $f28, $f8                           \n\t"
1223     "pandn      $f30, $f30, $f10                          \n\t"
1224     "or         $f24, $f24, $f28                          \n\t"
1225     "or         $f26, $f26, $f30                          \n\t"
1226     "and        $f12, $f4, $f24                           \n\t"
1227     "and        $f14, $f6, $f26                           \n\t"
1228     "pandn      $f24, $f4, $f8                            \n\t"
1229     "pandn      $f26, $f6, $f10                           \n\t"
1230     "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
1231     "paddh      $f8, $f8, $f28                            \n\t"
1232     "paddh      $f10, $f10, $f30                          \n\t"
1233     "paddh      $f8, $f8, $f16                            \n\t"
1234     "paddh      $f10, $f10, $f18                          \n\t"
1235     "or         $f12, $f12, $f24                          \n\t"
1236     "or         $f14, $f14, $f26                          \n\t"
1237     "gslqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
1238     "dmtc1      %[iAlpha], $f28                           \n\t"
1239     "packushb   $f24, $f24, $f26                          \n\t"
1240     "packushb   $f26, $f12, $f14                          \n\t"
1241     "psllh      $f8, $f8, $f28                            \n\t"
1242     "psllh      $f10, $f10, $f28                          \n\t"
1243     "gssqc1     $f26, $f24, 672-272(%[tmp])               \n\t"
1244     "gslqc1     $f26, $f24, 640-96(%[tmp])                \n\t"
1245     "gslqc1     $f30, $f28, 640-352(%[tmp])               \n\t"
1246     "or         $f24, $f24, $f28                          \n\t"
1247     "or         $f26, $f26, $f30                          \n\t"
1248     "dli        %[iAlpha], 0x3                            \n\t"
1249 
1250     "and        $f12, $f0, $f24                           \n\t"
1251     "and        $f14, $f2, $f26                           \n\t"
1252     "gslqc1     $f26, $f24, 640-144(%[tmp])               \n\t"
1253     "pandn      $f24, $f0, $f24                           \n\t"
1254     "pandn      $f26, $f2, $f26                           \n\t"
1255     "or         $f12, $f12, $f24                          \n\t"
1256     "or         $f14, $f14, $f26                          \n\t"
1257     "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
1258     "gssqc1     $f14, $f12, 640-352(%[tmp])               \n\t"
1259     "gslqc1     $f14, $f12, 640-464(%[tmp])               \n\t"
1260     "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
1261     "paddh      $f12, $f12, $f28                          \n\t"
1262     "paddh      $f14, $f14, $f30                          \n\t"
1263     "paddh      $f8, $f8, $f12                            \n\t"
1264     "paddh      $f10, $f10, $f14                          \n\t"
1265     "gslqc1     $f14, $f12, 640-448(%[tmp])               \n\t"
1266     "paddh      $f20, $f20, $f8                           \n\t"
1267     "paddh      $f22, $f22, $f10                          \n\t"
1268     "dmtc1      %[iAlpha], $f28                           \n\t"
1269     "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
1270     "psrah      $f20, $f20, $f28                          \n\t"
1271     "psrah      $f22, $f22, $f28                          \n\t"
1272     "and        $f24, $f24, $f20                          \n\t"
1273     "and        $f26, $f26, $f22                          \n\t"
1274     "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
1275     "paddh      $f8, $f8, $f20                            \n\t"
1276     "paddh      $f10, $f10, $f22                          \n\t"
1277     "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
1278     "dli        %[iAlpha], 0x2                            \n\t"
1279     "paddh      $f20, $f20, $f28                          \n\t"
1280     "paddh      $f22, $f22, $f30                          \n\t"
1281     "paddh      $f16, $f12, $f12                          \n\t"
1282     "paddh      $f18, $f14, $f14                          \n\t"
1283     "paddh      $f16, $f16, $f8                           \n\t"
1284     "paddh      $f18, $f18, $f10                          \n\t"
1285     "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
1286     "paddh      $f16, $f16, $f28                          \n\t"
1287     "paddh      $f18, $f18, $f30                          \n\t"
1288     "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
1289     "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
1290     "paddh      $f12, $f12, $f28                          \n\t"
1291     "paddh      $f14, $f14, $f30                          \n\t"
1292     "dmtc1      %[iAlpha], $f28                           \n\t"
1293     "psrah      $f16, $f16, $f28                          \n\t"
1294     "psrah      $f18, $f18, $f28                          \n\t"
1295     "pandn      $f8, $f8, $f16                            \n\t"
1296     "pandn      $f10, $f10, $f18                          \n\t"
1297     "or         $f24, $f24, $f8                           \n\t"
1298     "or         $f26, $f26, $f10                          \n\t"
1299     "and        $f28, $f4, $f24                           \n\t"
1300     "and        $f30, $f6, $f26                           \n\t"
1301     "gslqc1     $f26, $f24, 640-496(%[tmp])               \n\t"
1302     "pandn      $f8, $f4, $f24                            \n\t"
1303     "pandn      $f10, $f6, $f26                           \n\t"
1304     "or         $f28, $f28, $f8                           \n\t"
1305     "or         $f30, $f30, $f10                          \n\t"
1306     "gslqc1     $f10, $f8, 640-352(%[tmp])                \n\t"
1307     "packushb   $f8, $f8, $f10                            \n\t"
1308     "packushb   $f10, $f28, $f30                          \n\t"
1309     "gssqc1     $f10, $f8, 688-272(%[tmp])                \n\t"
1310     "gslqc1     $f10, $f8, 640-128(%[tmp])                \n\t"
1311     "gslqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
1312     "or         $f8, $f8, $f28                            \n\t"
1313     "or         $f10, $f10, $f30                          \n\t"
1314     "dli        %[iAlpha], 0x1                            \n\t"
1315 
1316     "and        $f16, $f0, $f8                            \n\t"
1317     "and        $f18, $f2, $f10                           \n\t"
1318     "paddh      $f20, $f20, $f24                          \n\t"
1319     "paddh      $f22, $f22, $f26                          \n\t"
1320     "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
1321     "pandn      $f8, $f0, $f28                            \n\t"
1322     "pandn      $f10, $f2, $f30                           \n\t"
1323     "or         $f16, $f16, $f8                           \n\t"
1324     "or         $f18, $f18, $f10                          \n\t"
1325     "dmtc1      %[iAlpha], $f28                           \n\t"
1326     "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
1327     "dli        %[iAlpha], 0x3                            \n\t"
1328     "psllh      $f20, $f20, $f28                          \n\t"
1329     "psllh      $f22, $f22, $f28                          \n\t"
1330     "paddh      $f20, $f20, $f12                          \n\t"
1331     "paddh      $f22, $f22, $f14                          \n\t"
1332     "dmtc1      %[iAlpha], $f28                           \n\t"
1333     "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
1334     "paddh      $f8, $f8, $f20                            \n\t"
1335     "paddh      $f10, $f10, $f22                          \n\t"
1336     "psrah      $f8, $f8, $f28                            \n\t"
1337     "psrah      $f10, $f10, $f28                          \n\t"
1338     "gssqc1     $f18, $f16, 640-288(%[tmp])               \n\t"
1339     "gslqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
1340     "and        $f16, $f16, $f8                           \n\t"
1341     "and        $f18, $f18, $f10                          \n\t"
1342     "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
1343     "paddh      $f20, $f8, $f8                            \n\t"
1344     "paddh      $f22, $f10, $f10                          \n\t"
1345     "gslqc1     $f10, $f8, 640-432(%[tmp])                \n\t"
1346     "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
1347     "paddh      $f8, $f8, $f28                            \n\t"
1348     "paddh      $f10, $f10, $f30                          \n\t"
1349     "dli        %[iAlpha], 0x2                            \n\t"
1350     "paddh      $f20, $f20, $f8                           \n\t"
1351     "paddh      $f22, $f22, $f10                          \n\t"
1352     "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
1353     "paddh      $f20, $f20, $f28                          \n\t"
1354     "paddh      $f22, $f22, $f30                          \n\t"
1355     "dmtc1      %[iAlpha], $f28                           \n\t"
1356     "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
1357     "psrah      $f20, $f20, $f28                          \n\t"
1358     "psrah      $f22, $f22, $f28                          \n\t"
1359     "pandn      $f12, $f12, $f20                          \n\t"
1360     "pandn      $f14, $f14, $f22                          \n\t"
1361     "or         $f16, $f16, $f12                          \n\t"
1362     "or         $f18, $f18, $f14                          \n\t"
1363     "gslqc1     $f14, $f12, 640-32(%[tmp])                \n\t"
1364     "gslqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
1365     "or         $f12, $f12, $f28                          \n\t"
1366     "or         $f14, $f14, $f30                          \n\t"
1367     "and        $f28, $f4, $f16                           \n\t"
1368     "and        $f30, $f6, $f18                           \n\t"
1369     "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
1370     "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
1371     "pandn      $f8, $f4, $f16                            \n\t"
1372     "pandn      $f10, $f6, $f18                           \n\t"
1373     "or         $f28, $f28, $f8                           \n\t"
1374     "or         $f30, $f30, $f10                          \n\t"
1375     "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
1376     "paddh      $f16, $f16, $f8                           \n\t"
1377     "paddh      $f18, $f18, $f10                          \n\t"
1378     "gslqc1     $f10, $f8, 640-288(%[tmp])                \n\t"
1379     "packushb   $f8, $f8, $f10                            \n\t"
1380     "packushb   $f10, $f28, $f30                          \n\t"
1381     "dli        %[iAlpha], 0x2                            \n\t"
1382     "gssqc1     $f10, $f8, 704-272(%[tmp])                \n\t"
1383 
1384     "and        $f8, $f0, $f12                            \n\t"
1385     "and        $f10, $f2, $f14                           \n\t"
1386     "gslqc1     $f30, $f28, 640-384(%[tmp])               \n\t"
1387     "pandn      $f12, $f0, $f28                           \n\t"
1388     "pandn      $f14, $f2, $f30                           \n\t"
1389     "or         $f8, $f8, $f12                            \n\t"
1390     "or         $f10, $f10, $f14                          \n\t"
1391     "gssqc1     $f10, $f8, 640-304(%[tmp])                \n\t"
1392     "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
1393     "gslqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
1394     "paddh      $f12, $f8, $f28                           \n\t"
1395     "paddh      $f14, $f10, $f30                          \n\t"
1396     "paddh      $f12, $f12, $f16                          \n\t"
1397     "paddh      $f14, $f14, $f18                          \n\t"
1398     "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
1399     "paddh      $f12, $f12, $f28                          \n\t"
1400     "paddh      $f14, $f14, $f30                          \n\t"
1401     "dmtc1      %[iAlpha], $f28                           \n\t"
1402     "psrah      $f12, $f12, $f28                          \n\t"
1403     "psrah      $f14, $f14, $f28                          \n\t"
1404     "and        $f24, $f24, $f12                          \n\t"
1405     "and        $f26, $f26, $f14                          \n\t"
1406     "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
1407     "pandn      $f16, $f12, $f20                          \n\t"
1408     "pandn      $f18, $f14, $f22                          \n\t"
1409     "or         $f24, $f24, $f16                          \n\t"
1410     "or         $f26, $f26, $f18                          \n\t"
1411     "and        $f28, $f4, $f24                           \n\t"
1412     "and        $f30, $f6, $f26                           \n\t"
1413     "gslqc1     $f26, $f24, 640-304(%[tmp])               \n\t"
1414     "pandn      $f16, $f4, $f20                           \n\t"
1415     "pandn      $f18, $f6, $f22                           \n\t"
1416     "or         $f28, $f28, $f16                          \n\t"
1417     "or         $f30, $f30, $f18                          \n\t"
1418     "dli        %[iAlpha], 0x1                            \n\t"
1419 
1420     "packushb   $f24, $f24, $f26                          \n\t"
1421     "packushb   $f26, $f28, $f30                          \n\t"
1422     "gslqc1     $f30, $f28, 640-112(%[tmp])               \n\t"
1423     "gslqc1     $f18, $f16, 640-80(%[tmp])                \n\t"
1424     "or         $f28, $f28, $f16                          \n\t"
1425     "or         $f30, $f30, $f18                          \n\t"
1426     "and        $f16, $f0, $f28                           \n\t"
1427     "and        $f18, $f2, $f30                           \n\t"
1428     "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
1429     "pandn      $f0, $f0, $f28                            \n\t"
1430     "pandn      $f2, $f2, $f30                            \n\t"
1431     "or         $f16, $f16, $f0                           \n\t"
1432     "or         $f18, $f18, $f2                           \n\t"
1433     "xor        $f28, $f28, $f28                          \n\t"
1434     "xor        $f30, $f30, $f30                          \n\t"
1435     "gslqc1     $f2, $f0, 0x0($12)                        \n\t"
1436     "dmtc1      %[iAlpha], $f28                           \n\t"
1437     "punpcklbh  $f0, $f2, $f30                            \n\t"
1438     "punpckhbh  $f2, $f2, $f30                            \n\t"
1439     "psllh      $f0, $f0, $f28                            \n\t"
1440     "psllh      $f2, $f2, $f28                            \n\t"
1441     "paddh      $f0, $f0, $f8                             \n\t"
1442     "paddh      $f2, $f2, $f10                            \n\t"
1443     "paddh      $f0, $f0, $f8                             \n\t"
1444     "paddh      $f2, $f2, $f10                            \n\t"
1445     "paddh      $f0, $f0, $f8                             \n\t"
1446     "paddh      $f2, $f2, $f10                            \n\t"
1447     "paddh      $f0, $f0, $f20                            \n\t"
1448     "paddh      $f2, $f2, $f22                            \n\t"
1449     "dli        %[iAlpha], 0x3                            \n\t"
1450     "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
1451     "paddh      $f0, $f0, $f28                            \n\t"
1452     "paddh      $f2, $f2, $f30                            \n\t"
1453     "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
1454     "paddh      $f0, $f0, $f28                            \n\t"
1455     "paddh      $f2, $f2, $f30                            \n\t"
1456     "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
1457     "paddh      $f0, $f0, $f28                            \n\t"
1458     "paddh      $f2, $f2, $f30                            \n\t"
1459     "dmtc1      %[iAlpha], $f28                           \n\t"
1460     "psrah      $f0, $f0, $f28                            \n\t"
1461     "psrah      $f2, $f2, $f28                            \n\t"
1462     "and        $f0, $f0, $f12                            \n\t"
1463     "and        $f2, $f2, $f14                            \n\t"
1464     "pandn      $f12, $f12, $f8                           \n\t"
1465     "pandn      $f14, $f14, $f10                          \n\t"
1466     "or         $f0, $f0, $f12                            \n\t"
1467     "or         $f2, $f2, $f14                            \n\t"
1468     "and        $f28, $f4, $f0                            \n\t"
1469     "and        $f30, $f6, $f2                            \n\t"
1470 
1471     "gslqc1     $f2, $f0, 656-272(%[tmp])                 \n\t"
1472     "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
1473 
1474     "gslqc1     $f2, $f0, 672-272(%[tmp])                 \n\t"
1475 
1476     "gssqc1     $f2, $f0, 0x0($8)                         \n\t"
1477     "gslqc1     $f2, $f0, 688-272(%[tmp])                 \n\t"
1478     "gssqc1     $f2, $f0, 0x0($9)                         \n\t"
1479     "gslqc1     $f2, $f0, 704-272(%[tmp])                 \n\t"
1480 
1481     "pandn      $f4, $f4, $f8                             \n\t"
1482     "pandn      $f6, $f6, $f10                            \n\t"
1483     "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
1484     "or         $f28, $f28, $f4                           \n\t"
1485     "or         $f30, $f30, $f6                           \n\t"
1486     "packushb   $f16, $f16, $f18                          \n\t"
1487     "packushb   $f18, $f28, $f30                          \n\t"
1488     "gssqc1     $f26, $f24, 0x0($13)                      \n\t"
1489     "gssqc1     $f18, $f16, 0x0(%[iStride])               \n\t"
1490     : [pPix]"+&r"((unsigned char *)pPix)
1491     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
1492       [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
1493     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
1494       "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
1495       "$f22", "$f24", "$f26", "$f28", "$f30"
1496   );
1497   RECOVER_REG;
1498 }
1499 
DeblockChromaLt4V_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)1500 void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1501                            int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
1502   unsigned char tmp[256] __attribute__((aligned(32)));
1503   BACKUP_REG;
1504   __asm__ volatile (
1505     ".set       arch=loongson3a                           \n\t"
1506     "lb         $8, 0x2(%[pTC])                           \n\t"
1507     "lb         $9, 0x3(%[pTC])                           \n\t"
1508     "move       $11, $8                                   \n\t"
1509     "lb         $8, 0x1(%[pTC])                           \n\t"
1510     "lb         %[pTC], 0x0(%[pTC])                       \n\t"
1511     "move       $12, %[pTC]                               \n\t"
1512     "and        %[pTC], $9, 0xFFFF                        \n\t"
1513     "dmtc1      %[pTC], $f4                               \n\t"
1514     "and        %[pTC], $9, 0xFFFF                        \n\t"
1515     "dmtc1      %[pTC], $f8                               \n\t"
1516     "move       %[pTC], $11                               \n\t"
1517     "and        $9, %[pTC], 0xFFFF                        \n\t"
1518     "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
1519     "dmtc1      %[pTC], $f16                              \n\t"
1520     "and        %[pTC], $8, 0xFFFF                        \n\t"
1521     "dmtc1      %[pTC], $f20                              \n\t"
1522     "dmtc1      $9, $f12                                  \n\t"
1523     "and        %[pTC], $8, 0xFFFF                        \n\t"
1524     "dmtc1      %[pTC], $f24                              \n\t"
1525     "move       %[pTC], $12                               \n\t"
1526     "and        $9, %[pTC], 0xFFFF                        \n\t"
1527     "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
1528     "punpcklhw  $f24, $f24, $f8                           \n\t"
1529     "xor        $f0, $f0, $f0                             \n\t"
1530     "xor        $f2, $f2, $f2                             \n\t"
1531     "gssqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
1532     "dmtc1      $9, $f28                                  \n\t"
1533     "dmtc1      %[pTC], $f0                               \n\t"
1534     "daddu      %[pTC], %[iStride], %[iStride]            \n\t"
1535     "dsubu      $9, %[pPixCb], %[pTC]                     \n\t"
1536     "punpcklhw  $f20, $f20, $f4                           \n\t"
1537     "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
1538     "punpcklhw  $f0, $f0, $f16                            \n\t"
1539     "gsldxc1    $f16, 0x0(%[iStride], %[pPixCr])          \n\t"
1540     "punpcklhw  $f28, $f28, $f12                          \n\t"
1541     "gsldxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
1542     "punpcklhw  $f0, $f0, $f24                            \n\t"
1543     "gsldxc1    $f24, 0x0($9, $0)                         \n\t"
1544     "punpcklhw  $f28, $f28, $f20                          \n\t"
1545     "punpckhhw  $f2, $f0, $f28                            \n\t"
1546     "punpcklhw  $f0, $f0, $f28                            \n\t"
1547     "dsubu      $9, %[pPixCr], %[pTC]                     \n\t"
1548     "psubh      $f8, $f4, $f0                             \n\t"
1549     "psubh      $f10, $f6, $f2                            \n\t"
1550     "gssqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
1551     "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
1552     "mov.d      $f26, $f8                                 \n\t"
1553     "dsubu      %[pTC], %[pPixCb], %[iStride]             \n\t"
1554     "gsldxc1    $f28, 0x0(%[pTC], $0)                     \n\t"
1555     "dsubu      $9, %[pPixCr], %[iStride]                 \n\t"
1556     "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
1557     "mov.d      $f30, $f8                                 \n\t"
1558     "gsldxc1    $f8, 0x0(%[pPixCr], $0)                   \n\t"
1559     "mov.d      $f14, $f8                                 \n\t"
1560     "gsldxc1    $f8, 0x0(%[iStride], %[pPixCb])           \n\t"
1561     "mov.d      $f10, $f16                                \n\t"
1562     "gssqc1     $f10, $f8, 0xE0(%[tmp])                   \n\t"
1563     "dmtc1      %[iAlpha], $f8                            \n\t"
1564     "punpcklhw  $f16, $f8, $f8                            \n\t"
1565     "dmtc1      %[iBeta], $f8                             \n\t"
1566     "punpcklhw  $f20, $f8, $f8                            \n\t"
1567     "punpcklwd  $f8, $f20, $f20                           \n\t"
1568     "mov.d      $f10, $f8                                 \n\t"
1569     "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
1570     "punpckhbh  $f10, $f24, $f4                           \n\t"
1571     "punpcklbh  $f8, $f24, $f4                            \n\t"
1572     "gssqc1     $f14, $f12, 0xd0(%[tmp])                  \n\t"
1573     "punpcklwd  $f16, $f16, $f16                          \n\t"
1574     "mov.d      $f18, $f16                                \n\t"
1575     "gssqc1     $f10, $f8, 0x30(%[tmp])                   \n\t"
1576     "punpcklbh  $f24, $f26, $f6                           \n\t"
1577     "punpckhbh  $f26, $f26, $f6                           \n\t"
1578     "gssqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
1579     "gslqc1     $f26, $f24, 0xd0(%[tmp])                  \n\t"
1580     "punpcklbh  $f24, $f26, $f6                           \n\t"
1581     "punpckhbh  $f26, $f26, $f6                           \n\t"
1582     "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
1583     "gslqc1     $f26, $f24, 0xe0(%[tmp])                  \n\t"
1584     "punpcklbh  $f24, $f26, $f6                           \n\t"
1585     "punpckhbh  $f26, $f26, $f6                           \n\t"
1586     "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
1587     "gslqc1     $f22, $f20, 0xe0(%[tmp])                  \n\t"
1588     "mov.d      $f8, $f28                                 \n\t"
1589     "mov.d      $f10, $f30                                \n\t"
1590     "punpcklbh  $f28, $f30, $f6                           \n\t"
1591     "punpckhbh  $f30, $f30, $f6                           \n\t"
1592     "punpckhbh  $f22, $f20, $f4                           \n\t"
1593     "punpcklbh  $f20, $f20, $f4                           \n\t"
1594     "gssqc1     $f30, $f28, 0xa0(%[tmp])                  \n\t"
1595     "punpckhbh  $f14, $f12, $f4                           \n\t"
1596     "punpcklbh  $f12, $f12, $f4                           \n\t"
1597     "dli        %[iBeta], 0x4                             \n\t"
1598     "punpckhbh  $f10, $f8, $f4                            \n\t"
1599     "punpcklbh  $f8, $f8, $f4                             \n\t"
1600     "dmtc1      %[iBeta], $f24                            \n\t"
1601     "punpcklhw  $f28, $f24, $f24                          \n\t"
1602     "punpcklwd  $f24, $f28, $f28                          \n\t"
1603     "mov.d      $f26, $f24                                \n\t"
1604     "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
1605     "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
1606     "psubh      $f28, $f28, $f20                          \n\t"
1607     "psubh      $f30, $f30, $f22                          \n\t"
1608     "pcmpgth    $f24, $f0, $f4                            \n\t"
1609     "pcmpgth    $f26, $f2, $f6                            \n\t"
1610     "gslqc1     $f6, $f4, 0x60(%[tmp])                    \n\t"
1611     "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
1612     "psubh      $f24, $f12, $f8                           \n\t"
1613     "psubh      $f26, $f14, $f10                          \n\t"
1614     "dmfc1      %[iAlpha], $f12                           \n\t"
1615     "dmfc1      %[iBeta], $f14                            \n\t"
1616     "dli        $10, 0x2                                  \n\t"
1617     "dmtc1      $10, $f12                                 \n\t"
1618     "dli        $10, 0x3                                  \n\t"
1619     "dmtc1      $10, $f14                                 \n\t"
1620     "psllh      $f24, $f24, $f12                          \n\t"
1621     "psllh      $f26, $f26, $f12                          \n\t"
1622     "paddh      $f24, $f24, $f28                          \n\t"
1623     "paddh      $f26, $f26, $f30                          \n\t"
1624     "gslqc1     $f30, $f28, 0x20(%[tmp])                  \n\t"
1625     "paddh      $f24, $f24, $f28                          \n\t"
1626     "paddh      $f26, $f26, $f30                          \n\t"
1627     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
1628     "psrah      $f24, $f24, $f14                          \n\t"
1629     "psrah      $f26, $f26, $f14                          \n\t"
1630     "dmtc1      %[iAlpha], $f12                           \n\t"
1631     "dmtc1      %[iBeta], $f14                            \n\t"
1632     "pmaxsh     $f4, $f4, $f24                            \n\t"
1633     "pmaxsh     $f6, $f6, $f26                            \n\t"
1634     "gssqc1     $f2, $f0, 0x10(%[tmp])                    \n\t"
1635     "gslqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
1636     "pminsh     $f24, $f24, $f4                           \n\t"
1637     "pminsh     $f26, $f26, $f6                           \n\t"
1638     "gssqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
1639     "psubh      $f4, $f8, $f12                            \n\t"
1640     "psubh      $f6, $f10, $f14                           \n\t"
1641     WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
1642     "pcmpgth    $f24, $f16, $f4                           \n\t"
1643     "pcmpgth    $f26, $f18, $f6                           \n\t"
1644     "gslqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
1645     "psubh      $f4, $f4, $f8                             \n\t"
1646     "psubh      $f6, $f6, $f10                            \n\t"
1647     "dmfc1      %[iAlpha], $f8                            \n\t"
1648     "dmfc1      %[iBeta], $f10                            \n\t"
1649     WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
1650     "pcmpgth    $f28, $f28, $f4                           \n\t"
1651     "pcmpgth    $f30, $f30, $f6                           \n\t"
1652     "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
1653     "and        $f24, $f24, $f28                          \n\t"
1654     "and        $f26, $f26, $f30                          \n\t"
1655     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
1656     "psubh      $f20, $f20, $f12                          \n\t"
1657     "psubh      $f22, $f22, $f14                          \n\t"
1658     WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
1659     "pcmpgth    $f4, $f4, $f20                            \n\t"
1660     "pcmpgth    $f6, $f6, $f22                            \n\t"
1661     "gslqc1     $f22, $f20, 0x80(%[tmp])                  \n\t"
1662     "gslqc1     $f10, $f8, 0x90(%[tmp])                   \n\t"
1663     "psubh      $f20, $f20, $f8                           \n\t"
1664     "psubh      $f22, $f22, $f10                          \n\t"
1665     "and        $f24, $f24, $f4                           \n\t"
1666     "and        $f26, $f26, $f6                           \n\t"
1667     "gslqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
1668     "and        $f24, $f24, $f8                           \n\t"
1669     "and        $f26, $f26, $f10                          \n\t"
1670     "gslqc1     $f6, $f4, 0x10(%[tmp])                    \n\t"
1671     "and        $f4, $f4, $f24                            \n\t"
1672     "and        $f6, $f6, $f26                            \n\t"
1673     "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
1674     "gssqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
1675     "gslqc1     $f6, $f4, 0xa0(%[tmp])                    \n\t"
1676     "psubh      $f24, $f24, $f4                           \n\t"
1677     "psubh      $f26, $f26, $f6                           \n\t"
1678     "dli        $10, 0x2                                  \n\t"
1679     "dmtc1      $10, $f8                                  \n\t"
1680     "psllh      $f24, $f24, $f8                           \n\t"
1681     "psllh      $f26, $f26, $f8                           \n\t"
1682     "paddh      $f24, $f24, $f20                          \n\t"
1683     "paddh      $f26, $f26, $f22                          \n\t"
1684     "dli        $10, 0x3                                  \n\t"
1685     "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
1686     "paddh      $f24, $f24, $f8                           \n\t"
1687     "paddh      $f26, $f26, $f10                          \n\t"
1688     "dmtc1      $10, $f8                                  \n\t"
1689     "gslqc1     $f22, $f20, 0x60(%[tmp])                  \n\t"
1690     "psrah      $f24, $f24, $f8                           \n\t"
1691     "psrah      $f26, $f26, $f8                           \n\t"
1692     "pmaxsh     $f20, $f20, $f24                          \n\t"
1693     "pmaxsh     $f22, $f22, $f26                          \n\t"
1694     "pminsh     $f0, $f0, $f20                            \n\t"
1695     "pminsh     $f2, $f2, $f22                            \n\t"
1696     "gslqc1     $f22, $f20, 0x70(%[tmp])                  \n\t"
1697     "psubh      $f24, $f4, $f20                           \n\t"
1698     "psubh      $f26, $f6, $f22                           \n\t"
1699     WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1700     "pcmpgth    $f16, $f16, $f24                          \n\t"
1701     "pcmpgth    $f18, $f18, $f26                          \n\t"
1702     "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
1703     "psubh      $f24, $f24, $f4                           \n\t"
1704     "psubh      $f26, $f26, $f6                           \n\t"
1705     WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1706     "pcmpgth    $f28, $f28, $f24                          \n\t"
1707     "pcmpgth    $f30, $f30, $f26                          \n\t"
1708     "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
1709     "and        $f16, $f16, $f28                          \n\t"
1710     "and        $f18, $f18, $f30                          \n\t"
1711     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
1712     "psubh      $f24, $f24, $f20                          \n\t"
1713     "psubh      $f26, $f26, $f22                          \n\t"
1714     WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
1715     "dmtc1      %[iAlpha], $f8                            \n\t"
1716     "dmtc1      %[iBeta], $f10                            \n\t"
1717     "pcmpgth    $f28, $f28, $f24                          \n\t"
1718     "pcmpgth    $f30, $f30, $f26                          \n\t"
1719     "and        $f16, $f16, $f28                          \n\t"
1720     "and        $f18, $f18, $f30                          \n\t"
1721     "gslqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
1722     "and        $f16, $f16, $f24                          \n\t"
1723     "and        $f18, $f18, $f26                          \n\t"
1724     "and        $f0, $f0, $f16                            \n\t"
1725     "and        $f2, $f2, $f18                            \n\t"
1726     "gslqc1     $f18, $f16, 0x30(%[tmp])                  \n\t"
1727     "paddh      $f8, $f8, $f16                            \n\t"
1728     "paddh      $f10, $f10, $f18                          \n\t"
1729     "paddh      $f4, $f4, $f0                             \n\t"
1730     "paddh      $f6, $f6, $f2                             \n\t"
1731     "packushb   $f8, $f8, $f10                            \n\t"
1732     "packushb   $f10, $f4, $f6                            \n\t"
1733     "gssdxc1    $f8, 0x0(%[pTC], $0)                      \n\t"
1734     "psubh      $f12, $f12, $f16                          \n\t"
1735     "psubh      $f14, $f14, $f18                          \n\t"
1736     "psubh      $f20, $f20, $f0                           \n\t"
1737     "psubh      $f22, $f22, $f2                           \n\t"
1738     "packushb   $f12, $f12, $f14                          \n\t"
1739     "packushb   $f14, $f20, $f22                          \n\t"
1740     "gssdxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
1741     "gssdxc1    $f10, 0x0($9, $0)                         \n\t"
1742     "gssdxc1    $f14, 0x0(%[pPixCr], $0)                  \n\t"
1743     : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
1744     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
1745       [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
1746     : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
1747       "$f10", "$f12",  "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1748       "$f28", "$f30"
1749   );
1750   RECOVER_REG;
1751 }
1752 
DeblockChromaEq4V_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)1753 void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1754                            int32_t iAlpha, int32_t iBeta) {
1755   unsigned char tmp[128] __attribute__((aligned(32)));
1756   BACKUP_REG;
1757   __asm__ volatile (
1758     ".set       arch=loongson3a                          \n\t"
1759     "daddu      $8, %[iStride], %[iStride]               \n\t"
1760     "dsubu      $9, %[pPixCb], $8                        \n\t"
1761     "gsldxc1    $f16, 0x0(%[pPixCr], $0)                 \n\t"
1762     "gsldxc1    $f20, 0x0(%[iStride], %[pPixCr])         \n\t"
1763     "gsldxc1    $f4, 0x0($9, $0)                         \n\t"
1764     "dsubu      $9, %[pPixCr], $8                        \n\t"
1765     "gsldxc1    $f8, 0x0($9, $0)                         \n\t"
1766     "mov.d      $f6, $f8                                 \n\t"
1767     "dsubu      $8, %[pPixCb], %[iStride]                \n\t"
1768     "gsldxc1    $f8, 0x0($8, $0)                         \n\t"
1769     "dsubu      $9, %[pPixCr], %[iStride]                \n\t"
1770     "gsldxc1    $f12, 0x0($9, $0)                        \n\t"
1771     "mov.d      $f10, $f12                               \n\t"
1772     "gsldxc1    $f12, 0x0(%[pPixCb], $0)                 \n\t"
1773     "mov.d      $f14, $f16                               \n\t"
1774     "gsldxc1    $f16, 0x0(%[iStride], %[pPixCb])         \n\t"
1775     "mov.d      $f18, $f20                               \n\t"
1776     "dmtc1      %[iAlpha], $f20                          \n\t"
1777     "xor        $f0, $f0, $f0                            \n\t"
1778     "xor        $f2, $f2, $f2                            \n\t"
1779     "punpcklhw  $f24, $f20, $f20                         \n\t"
1780     "punpcklwd  $f20, $f24, $f24                         \n\t"
1781     "mov.d      $f22, $f20                               \n\t"
1782     "dmtc1      %[iBeta], $f24                           \n\t"
1783     "punpcklhw  $f28, $f24, $f24                         \n\t"
1784     "punpcklwd  $f24, $f28, $f28                         \n\t"
1785     "mov.d      $f26, $f24                               \n\t"
1786     "mov.d      $f28, $f4                                \n\t"
1787     "punpcklbh  $f4, $f6, $f2                            \n\t"
1788     "punpckhbh  $f6, $f6, $f2                            \n\t"
1789     "punpckhbh  $f30, $f28, $f0                          \n\t"
1790     "punpcklbh  $f28, $f28, $f0                          \n\t"
1791     "gssqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
1792     "gssqc1     $f30, $f28, 0x60(%[tmp])                 \n\t"
1793     "punpckhbh  $f30, $f8, $f0                           \n\t"
1794     "punpcklbh  $f28, $f8, $f0                           \n\t"
1795     "gssqc1     $f30, $f28, 0x10(%[tmp])                 \n\t"
1796     "punpckhbh  $f30, $f12, $f0                          \n\t"
1797     "punpcklbh  $f28, $f12, $f0                          \n\t"
1798     "punpcklbh  $f12, $f14, $f2                          \n\t"
1799     "punpckhbh  $f14, $f14, $f2                          \n\t"
1800     "gssqc1     $f30, $f28, 0x50(%[tmp])                 \n\t"
1801     "mov.d      $f28, $f16                               \n\t"
1802     "punpcklbh  $f16, $f18, $f2                          \n\t"
1803     "punpckhbh  $f18, $f18, $f2                          \n\t"
1804     "punpcklbh  $f8, $f10, $f2                           \n\t"
1805     "punpckhbh  $f10, $f10, $f2                          \n\t"
1806     "punpckhbh  $f30, $f28, $f0                          \n\t"
1807     "punpcklbh  $f28, $f28, $f0                          \n\t"
1808     "gssqc1     $f14, $f12, 0x30(%[tmp])                 \n\t"
1809     "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
1810     "gslqc1     $f2, $f0, 0x50(%[tmp])                   \n\t"
1811     "psubh      $f4, $f12, $f0                           \n\t"
1812     "psubh      $f6, $f14, $f2                           \n\t"
1813     WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
1814     "gssqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
1815     "pcmpgth    $f0, $f20, $f4                           \n\t"
1816     "pcmpgth    $f2, $f22, $f6                           \n\t"
1817     "gslqc1     $f6, $f4, 0x60(%[tmp])                   \n\t"
1818     "psubh      $f4, $f4, $f12                           \n\t"
1819     "psubh      $f6, $f6, $f14                           \n\t"
1820     WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1821     "pcmpgth    $f16, $f24, $f4                          \n\t"
1822     "pcmpgth    $f18, $f26, $f6                          \n\t"
1823     "and        $f0, $f0, $f16                           \n\t"
1824     "and        $f2, $f2, $f18                           \n\t"
1825     "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
1826     "psubh      $f4, $f28, $f16                          \n\t"
1827     "psubh      $f6, $f30, $f18                          \n\t"
1828     WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1829     "pcmpgth    $f16, $f24, $f4                          \n\t"
1830     "pcmpgth    $f18, $f26, $f6                          \n\t"
1831     "gslqc1     $f6, $f4, 0x30(%[tmp])                   \n\t"
1832     "psubh      $f4, $f8, $f4                            \n\t"
1833     "psubh      $f6, $f10, $f6                           \n\t"
1834     "dmfc1      %[iAlpha], $f28                          \n\t"
1835     "dmfc1      %[iBeta], $f30                           \n\t"
1836     WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
1837     "pcmpgth    $f20, $f20, $f4                          \n\t"
1838     "pcmpgth    $f22, $f22, $f6                          \n\t"
1839     "gslqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
1840     "and        $f0, $f0, $f16                           \n\t"
1841     "and        $f2, $f2, $f18                           \n\t"
1842     "psubh      $f4, $f4, $f8                            \n\t"
1843     "psubh      $f6, $f6, $f10                           \n\t"
1844     WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
1845     "pcmpgth    $f16, $f24, $f4                          \n\t"
1846     "pcmpgth    $f18, $f26, $f6                          \n\t"
1847     "gslqc1     $f6, $f4, 0x20(%[tmp])                   \n\t"
1848     "gslqc1     $f30, $f28, 0x30(%[tmp])                 \n\t"
1849     "psubh      $f4, $f4, $f28                           \n\t"
1850     "psubh      $f6, $f6, $f30                           \n\t"
1851     "and        $f20, $f20, $f16                         \n\t"
1852     "and        $f22, $f22, $f18                         \n\t"
1853     WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
1854     "dmtc1      %[iAlpha], $f28                          \n\t"
1855     "dmtc1      %[iBeta], $f30                           \n\t"
1856     "pcmpgth    $f24, $f24, $f4                          \n\t"
1857     "pcmpgth    $f26, $f26, $f6                          \n\t"
1858     "and        $f20, $f20, $f24                         \n\t"
1859     "and        $f22, $f22, $f26                         \n\t"
1860     "dli        %[iBeta], 0x2                            \n\t"
1861     "dmtc1      %[iBeta], $f4                            \n\t"
1862     "punpcklhw  $f16, $f4, $f4                           \n\t"
1863     "punpcklwd  $f4, $f16, $f16                          \n\t"
1864     "mov.d      $f6, $f4                                 \n\t"
1865     "gslqc1     $f18, $f16, 0x60(%[tmp])                 \n\t"
1866     "paddh      $f24, $f16, $f16                         \n\t"
1867     "paddh      $f26, $f18, $f18                         \n\t"
1868     "paddh      $f24, $f24, $f12                         \n\t"
1869     "paddh      $f26, $f26, $f14                         \n\t"
1870     "paddh      $f24, $f24, $f28                         \n\t"
1871     "paddh      $f26, $f26, $f30                         \n\t"
1872     "gssqc1     $f6, $f4, 0x10(%[tmp])                   \n\t"
1873     "gslqc1     $f18, $f16, 0x10(%[tmp])                 \n\t"
1874     "paddh      $f24, $f24, $f16                         \n\t"
1875     "paddh      $f26, $f26, $f18                         \n\t"
1876     "dmtc1      %[iBeta], $f16                           \n\t"
1877     "psrah      $f24, $f24, $f16                         \n\t"
1878     "psrah      $f26, $f26, $f16                         \n\t"
1879     "pandn      $f16, $f0, $f12                          \n\t"
1880     "pandn      $f18, $f2, $f14                          \n\t"
1881     "gslqc1     $f14, $f12, 0x40(%[tmp])                 \n\t"
1882     "and        $f4, $f0, $f24                           \n\t"
1883     "and        $f6, $f2, $f26                           \n\t"
1884     "or         $f4, $f4, $f16                           \n\t"
1885     "or         $f6, $f6, $f18                           \n\t"
1886     "paddh      $f24, $f12, $f12                         \n\t"
1887     "paddh      $f26, $f14, $f14                         \n\t"
1888     "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
1889     "paddh      $f24, $f24, $f8                          \n\t"
1890     "paddh      $f26, $f26, $f10                         \n\t"
1891     "gslqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
1892     "paddh      $f24, $f24, $f16                         \n\t"
1893     "paddh      $f26, $f26, $f18                         \n\t"
1894     "dmtc1      %[iBeta], $f16                           \n\t"
1895     "paddh      $f24, $f24, $f12                         \n\t"
1896     "paddh      $f26, $f26, $f14                         \n\t"
1897     "psrah      $f24, $f24, $f16                         \n\t"
1898     "psrah      $f26, $f26, $f16                         \n\t"
1899     "and        $f16, $f20, $f24                         \n\t"
1900     "and        $f18, $f22, $f26                         \n\t"
1901     "pandn      $f24, $f20, $f8                          \n\t"
1902     "pandn      $f26, $f22, $f10                         \n\t"
1903     "or         $f16, $f16, $f24                         \n\t"
1904     "or         $f18, $f18, $f26                         \n\t"
1905     "packushb   $f4, $f4, $f6                            \n\t"
1906     "packushb   $f6, $f16, $f18                          \n\t"
1907     "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
1908     "paddh      $f24, $f28, $f28                         \n\t"
1909     "paddh      $f26, $f30, $f30                         \n\t"
1910     "paddh      $f24, $f24, $f16                         \n\t"
1911     "paddh      $f26, $f26, $f18                         \n\t"
1912     "gslqc1     $f10, $f8, 0x60(%[tmp])                  \n\t"
1913     "paddh      $f24, $f24, $f8                          \n\t"
1914     "paddh      $f26, $f26, $f10                         \n\t"
1915     "dmtc1      %[iBeta], $f28                           \n\t"
1916     "paddh      $f24, $f24, $f12                         \n\t"
1917     "paddh      $f26, $f26, $f14                         \n\t"
1918     "psrah      $f24, $f24, $f28                         \n\t"
1919     "psrah      $f26, $f26, $f28                         \n\t"
1920     "and        $f8, $f0, $f24                           \n\t"
1921     "and        $f10, $f2, $f26                          \n\t"
1922     "pandn      $f0, $f0, $f16                           \n\t"
1923     "pandn      $f2, $f2, $f18                           \n\t"
1924     "or         $f8, $f8, $f0                            \n\t"
1925     "or         $f10, $f10, $f2                          \n\t"
1926     "gslqc1     $f2, $f0, 0x20(%[tmp])                   \n\t"
1927     "paddh      $f24, $f0, $f0                           \n\t"
1928     "paddh      $f26, $f2, $f2                           \n\t"
1929     "gslqc1     $f2, $f0, 0x30(%[tmp])                   \n\t"
1930     "paddh      $f24, $f24, $f0                          \n\t"
1931     "paddh      $f26, $f26, $f2                          \n\t"
1932     "gslqc1     $f18, $f16, 0x40(%[tmp])                 \n\t"
1933     "paddh      $f24, $f24, $f16                         \n\t"
1934     "paddh      $f26, $f26, $f18                         \n\t"
1935     "paddh      $f24, $f24, $f12                         \n\t"
1936     "paddh      $f26, $f26, $f14                         \n\t"
1937     "gssdxc1    $f4, 0x0($8, $0)                         \n\t"
1938     "psrah      $f24, $f24, $f28                         \n\t"
1939     "psrah      $f26, $f26, $f28                         \n\t"
1940     "and        $f16, $f20, $f24                         \n\t"
1941     "and        $f18, $f22, $f26                         \n\t"
1942     "pandn      $f20, $f20, $f0                          \n\t"
1943     "pandn      $f22, $f22, $f2                          \n\t"
1944     "or         $f16, $f16, $f20                         \n\t"
1945     "or         $f18, $f18, $f22                         \n\t"
1946     "packushb   $f8, $f8, $f10                           \n\t"
1947     "packushb   $f10, $f16, $f18                         \n\t"
1948     "gssdxc1    $f8, 0x0(%[pPixCb], $0)                  \n\t"
1949     "gssdxc1    $f6, 0x0($9, $0)                         \n\t"
1950     "gssdxc1    $f10, 0x0(%[pPixCr], $0)                 \n\t"
1951     : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
1952     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
1953       [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
1954     : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
1955       "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1956       "$f28", "$f30"
1957   );
1958   RECOVER_REG;
1959 }
1960 
DeblockChromaEq4H_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)1961 void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
1962                            int32_t iAlpha, int32_t iBeta) {
1963   unsigned char tmp[256] __attribute__((aligned(32)));
1964   BACKUP_REG;
1965   __asm__ volatile (
1966     ".set       arch=loongson3a                           \n\t"
1967     "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
1968     "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
1969     "move       $9, %[pPixCb]                             \n\t"
1970     "move       $10, %[pPixCr]                            \n\t"
1971     "dsll       $11, %[iStride], 0x2                      \n\t"
1972     "daddu      %[pPixCb], %[pPixCb], $11                 \n\t"
1973     "daddu      %[pPixCr], %[pPixCr], $11                 \n\t"
1974     "daddiu     $11, %[tmp], 0x80                         \n\t"
1975     "gsldlc1    $f0, 0x7($9)                              \n\t"
1976     "gsldrc1    $f0, 0x0($9)                              \n\t"
1977     "daddu      $12, $9, %[iStride]                       \n\t"
1978     "gsldlc1    $f4, 0x7($12)                             \n\t"
1979     "gsldrc1    $f4, 0x0($12)                             \n\t"
1980     "daddu      $12, $12, %[iStride]                      \n\t"
1981     "gsldlc1    $f8, 0x7($12)                             \n\t"
1982     "gsldrc1    $f8, 0x0($12)                             \n\t"
1983     "daddu      $12, $12, %[iStride]                      \n\t"
1984     "gsldlc1    $f12, 0x7($12)                            \n\t"
1985     "gsldlc1    $f16, 0x7($10)                            \n\t"
1986     "gsldrc1    $f12, 0x0($12)                            \n\t"
1987     "gsldrc1    $f16, 0x0($10)                            \n\t"
1988     "daddu      $12, $10, %[iStride]                      \n\t"
1989     "gsldlc1    $f20, 0x7($12)                            \n\t"
1990     "gsldrc1    $f20, 0x0($12)                            \n\t"
1991     "daddu      $12, $12, %[iStride]                      \n\t"
1992     "gsldlc1    $f24, 0x7($12)                            \n\t"
1993     "gsldrc1    $f24, 0x0($12)                            \n\t"
1994     "daddu      $12, $12, %[iStride]                      \n\t"
1995     "gsldlc1    $f28, 0x7($12)                            \n\t"
1996     "gsldrc1    $f28, 0x0($12)                            \n\t"
1997     "punpcklwd  $f0, $f0, $f16                            \n\t"
1998     "punpcklwd  $f4, $f4, $f20                            \n\t"
1999     "punpcklwd  $f8, $f8, $f24                            \n\t"
2000     "punpcklwd  $f12, $f12, $f28                          \n\t"
2001     "gsldlc1    $f16, 0x7(%[pPixCb])                      \n\t"
2002     "gsldlc1    $f20, 0x7(%[pPixCr])                      \n\t"
2003     "gsldrc1    $f16, 0x0(%[pPixCb])                      \n\t"
2004     "gsldrc1    $f20, 0x0(%[pPixCr])                      \n\t"
2005     "punpcklwd  $f16, $f16, $f20                          \n\t"
2006     "mov.d      $f2, $f16                                 \n\t"
2007     "daddu      $12, %[pPixCb], %[iStride]                \n\t"
2008     "daddu      $13, %[pPixCr], %[iStride]                \n\t"
2009     "gsldlc1    $f16, 0x7($12)                            \n\t"
2010     "gsldlc1    $f20, 0x7($13)                            \n\t"
2011     "gsldrc1    $f16, 0x0($12)                            \n\t"
2012     "gsldrc1    $f20, 0x0($13)                            \n\t"
2013     "punpcklwd  $f16, $f16, $f20                          \n\t"
2014     "mov.d      $f6, $f16                                 \n\t"
2015     "daddu      $12, $12, %[iStride]                      \n\t"
2016     "daddu      $13, $13, %[iStride]                      \n\t"
2017     "gsldlc1    $f16, 0x7($12)                            \n\t"
2018     "gsldlc1    $f20, 0x7($13)                            \n\t"
2019     "gsldrc1    $f16, 0x0($12)                            \n\t"
2020     "gsldrc1    $f20, 0x0($13)                            \n\t"
2021     "punpcklwd  $f16, $f16, $f20                          \n\t"
2022     "mov.d      $f10, $f16                                \n\t"
2023     "daddu      $12, $12, %[iStride]                      \n\t"
2024     "daddu      $13, $13, %[iStride]                      \n\t"
2025     "gsldlc1    $f16, 0x7($12)                            \n\t"
2026     "gsldlc1    $f20, 0x7($13)                            \n\t"
2027     "gsldrc1    $f16, 0x0($12)                            \n\t"
2028     "gsldrc1    $f20, 0x0($13)                            \n\t"
2029     "punpcklwd  $f16, $f16, $f20                          \n\t"
2030     "mov.d      $f14, $f16                                \n\t"
2031     "punpcklbh  $f24, $f2, $f6                            \n\t"
2032     "punpckhbh  $f26, $f2, $f6                            \n\t"
2033     "punpckhbh  $f2, $f0, $f4                             \n\t"
2034     "punpcklbh  $f0, $f0, $f4                             \n\t"
2035     "punpcklbh  $f28, $f10, $f14                          \n\t"
2036     "punpckhbh  $f30, $f10, $f14                          \n\t"
2037     "punpckhbh  $f10, $f8, $f12                           \n\t"
2038     "punpcklbh  $f8, $f8, $f12                            \n\t"
2039     "punpcklhw  $f16, $f2, $f10                           \n\t"
2040     "punpckhhw  $f18, $f2, $f10                           \n\t"
2041     "punpckhhw  $f2, $f0, $f8                             \n\t"
2042     "punpcklhw  $f0, $f0, $f8                             \n\t"
2043     "punpcklhw  $f20, $f26, $f30                          \n\t"
2044     "punpckhhw  $f22, $f26, $f30                          \n\t"
2045     "punpckhhw  $f26, $f24, $f28                          \n\t"
2046     "punpcklhw  $f24, $f24, $f28                          \n\t"
2047     "punpcklwd  $f4, $f2, $f26                            \n\t"
2048     "punpckhwd  $f6, $f2, $f26                            \n\t"
2049     "punpckhwd  $f2, $f0, $f24                            \n\t"
2050     "punpcklwd  $f0, $f0, $f24                            \n\t"
2051     "punpcklwd  $f8, $f18, $f22                           \n\t"
2052     "punpckhwd  $f10, $f18, $f22                          \n\t"
2053     "punpckhwd  $f18, $f16, $f20                          \n\t"
2054     "punpcklwd  $f16, $f16, $f20                          \n\t"
2055     "mov.d      $f20, $f2                                 \n\t"
2056     "mov.d      $f22, $f18                                \n\t"
2057     "mov.d      $f2, $f16                                 \n\t"
2058     "mov.d      $f24, $f6                                 \n\t"
2059     "mov.d      $f26, $f10                                \n\t"
2060     "mov.d      $f6, $f8                                  \n\t"
2061     "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
2062     "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
2063     "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
2064     "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
2065     "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
2066     "gslqc1     $f18, $f16, 0x90(%[tmp])                  \n\t"
2067     "gslqc1     $f22, $f20, 0xa0(%[tmp])                  \n\t"
2068     "gslqc1     $f30, $f28, 0xb0(%[tmp])                  \n\t"
2069     "xor        $f0, $f0, $f0                             \n\t"
2070     "dmtc1      %[iAlpha], $f4                            \n\t"
2071     "punpcklhw  $f8, $f4, $f4                             \n\t"
2072     "punpcklwd  $f4, $f8, $f8                             \n\t"
2073     "mov.d      $f6, $f4                                  \n\t"
2074     "dmtc1      %[iBeta], $f8                             \n\t"
2075     "punpcklhw  $f12, $f8, $f8                            \n\t"
2076     "punpcklwd  $f8, $f12, $f12                           \n\t"
2077     "mov.d      $f10, $f8                                 \n\t"
2078     "mov.d      $f12, $f24                                \n\t"
2079     "punpcklbh  $f24, $f26, $f0                           \n\t"
2080     "punpckhbh  $f26, $f26, $f0                           \n\t"
2081     "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
2082     "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
2083     "punpcklbh  $f24, $f26, $f0                           \n\t"
2084     "punpckhbh  $f26, $f26, $f0                           \n\t"
2085     "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
2086     "gslqc1     $f26, $f24, 0xa0(%[tmp])                  \n\t"
2087     "punpcklbh  $f24, $f26, $f0                           \n\t"
2088     "punpckhbh  $f26, $f26, $f0                           \n\t"
2089     "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
2090     "gslqc1     $f26, $f24, 0xb0(%[tmp])                  \n\t"
2091     "punpcklbh  $f24, $f26, $f0                           \n\t"
2092     "punpckhbh  $f26, $f26, $f0                           \n\t"
2093     "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
2094     "punpckhbh  $f30, $f28, $f0                           \n\t"
2095     "punpcklbh  $f28, $f28, $f0                           \n\t"
2096     "punpckhbh  $f18, $f16, $f0                           \n\t"
2097     "punpcklbh  $f16, $f16, $f0                           \n\t"
2098     "punpckhbh  $f22, $f20, $f0                           \n\t"
2099     "punpcklbh  $f20, $f20, $f0                           \n\t"
2100     "punpckhbh  $f14, $f12, $f0                           \n\t"
2101     "punpcklbh  $f12, $f12, $f0                           \n\t"
2102     "gssqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
2103     "psubh      $f24, $f16, $f20                          \n\t"
2104     "psubh      $f26, $f18, $f22                          \n\t"
2105     WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2106     "pcmpgth    $f0, $f4, $f24                            \n\t"
2107     "pcmpgth    $f2, $f6, $f26                            \n\t"
2108     "psubh      $f24, $f12, $f16                          \n\t"
2109     "psubh      $f26, $f14, $f18                          \n\t"
2110     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
2111     "pcmpgth    $f28, $f8, $f24                           \n\t"
2112     "pcmpgth    $f30, $f10, $f26                          \n\t"
2113     "gslqc1     $f26, $f24, 0x50(%[tmp])                  \n\t"
2114     "psubh      $f24, $f24, $f20                          \n\t"
2115     "psubh      $f26, $f26, $f22                          \n\t"
2116     "and        $f0, $f0, $f28                            \n\t"
2117     "and        $f2, $f2, $f30                            \n\t"
2118     WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
2119     "dmfc1      %[iAlpha], $f20                           \n\t"
2120     "dmfc1      %[iBeta], $f22                            \n\t"
2121     "pcmpgth    $f28, $f8, $f24                           \n\t"
2122     "pcmpgth    $f30, $f10, $f26                          \n\t"
2123     "gslqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
2124     "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
2125     "psubh      $f24, $f24, $f20                          \n\t"
2126     "psubh      $f26, $f26, $f22                          \n\t"
2127     WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2128     "pcmpgth    $f4, $f4, $f24                            \n\t"
2129     "pcmpgth    $f6, $f6, $f26                            \n\t"
2130     "gslqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
2131     "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
2132     "psubh      $f24, $f24, $f20                          \n\t"
2133     "psubh      $f26, $f26, $f22                          \n\t"
2134     WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2135     "and        $f0, $f0, $f28                            \n\t"
2136     "and        $f2, $f2, $f30                            \n\t"
2137     "pcmpgth    $f28, $f8, $f24                           \n\t"
2138     "pcmpgth    $f30, $f10, $f26                          \n\t"
2139     "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
2140     "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
2141     "psubh      $f24, $f24, $f20                          \n\t"
2142     "psubh      $f26, $f26, $f22                          \n\t"
2143     WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
2144     "dli        $8, 0x2                                   \n\t"
2145     "and        $f4, $f4, $f28                            \n\t"
2146     "and        $f6, $f6, $f30                            \n\t"
2147     "pcmpgth    $f8, $f8, $f24                            \n\t"
2148     "pcmpgth    $f10, $f10, $f26                          \n\t"
2149     "and        $f4, $f4, $f8                             \n\t"
2150     "and        $f6, $f6, $f10                            \n\t"
2151     "dmtc1      $8, $f8                                   \n\t"
2152     "punpcklhw  $f24, $f8, $f8                            \n\t"
2153     "punpcklwd  $f8, $f24, $f24                           \n\t"
2154     "mov.d      $f10, $f8                                 \n\t"
2155     "gssqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
2156     "paddh      $f8, $f12, $f12                           \n\t"
2157     "paddh      $f10, $f14, $f14                          \n\t"
2158     "paddh      $f8, $f8, $f16                            \n\t"
2159     "paddh      $f10, $f10, $f18                          \n\t"
2160     "gslqc1     $f22, $f20, 0x50(%[tmp])                  \n\t"
2161     "paddh      $f8, $f8, $f20                            \n\t"
2162     "paddh      $f10, $f10, $f22                          \n\t"
2163     "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
2164     "paddh      $f8, $f8, $f24                            \n\t"
2165     "paddh      $f10, $f10, $f26                          \n\t"
2166     "dmtc1      $8, $f20                                  \n\t"
2167     "psrah      $f8, $f8, $f20                            \n\t"
2168     "psrah      $f10, $f10, $f20                          \n\t"
2169     "and        $f24, $f0, $f8                            \n\t"
2170     "and        $f26, $f2, $f10                           \n\t"
2171     "pandn      $f8, $f0, $f16                            \n\t"
2172     "pandn      $f10, $f2, $f18                           \n\t"
2173     "or         $f24, $f24, $f8                           \n\t"
2174     "or         $f26, $f26, $f10                          \n\t"
2175     "gslqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
2176     "paddh      $f28, $f8, $f8                            \n\t"
2177     "paddh      $f30, $f10, $f10                          \n\t"
2178     "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
2179     "paddh      $f28, $f28, $f20                          \n\t"
2180     "paddh      $f30, $f30, $f22                          \n\t"
2181     "gslqc1     $f18, $f16, 0x70(%[tmp])                  \n\t"
2182     "paddh      $f28, $f28, $f16                          \n\t"
2183     "paddh      $f30, $f30, $f18                          \n\t"
2184     "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
2185     "paddh      $f28, $f28, $f8                           \n\t"
2186     "paddh      $f30, $f30, $f10                          \n\t"
2187     "pandn      $f8, $f4, $f20                            \n\t"
2188     "pandn      $f10, $f6, $f22                           \n\t"
2189     "dmtc1      $8, $f20                                  \n\t"
2190     "psrah      $f28, $f28, $f20                          \n\t"
2191     "psrah      $f30, $f30, $f20                          \n\t"
2192     "and        $f16, $f4, $f28                           \n\t"
2193     "and        $f18, $f6, $f30                           \n\t"
2194     "or         $f16, $f16, $f8                           \n\t"
2195     "or         $f18, $f18, $f10                          \n\t"
2196     "gslqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
2197     "packushb   $f24, $f24, $f26                          \n\t"
2198     "packushb   $f26, $f16, $f18                          \n\t"
2199     "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
2200     "paddh      $f24, $f8, $f8                            \n\t"
2201     "paddh      $f26, $f10, $f10                          \n\t"
2202     "dmtc1      %[iAlpha], $f20                           \n\t"
2203     "dmtc1      %[iBeta], $f22                            \n\t"
2204     "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
2205     "paddh      $f24, $f24, $f20                          \n\t"
2206     "paddh      $f26, $f26, $f22                          \n\t"
2207     "paddh      $f24, $f24, $f12                          \n\t"
2208     "paddh      $f26, $f26, $f14                          \n\t"
2209     "mov.d      $f16, $f0                                 \n\t"
2210     "mov.d      $f18, $f2                                 \n\t"
2211     "pandn      $f0, $f0, $f20                            \n\t"
2212     "pandn      $f2, $f2, $f22                            \n\t"
2213     "dmtc1      $8, $f20                                  \n\t"
2214     "paddh      $f24, $f24, $f8                           \n\t"
2215     "paddh      $f26, $f26, $f10                          \n\t"
2216     "psrah      $f24, $f24, $f20                          \n\t"
2217     "psrah      $f26, $f26, $f20                          \n\t"
2218     "and        $f16, $f16, $f24                          \n\t"
2219     "and        $f18, $f18, $f26                          \n\t"
2220     "or         $f16, $f16, $f0                           \n\t"
2221     "or         $f18, $f18, $f2                           \n\t"
2222     "gslqc1     $f2, $f0, 0x70(%[tmp])                    \n\t"
2223     "paddh      $f20, $f0, $f0                            \n\t"
2224     "paddh      $f22, $f2, $f2                            \n\t"
2225     "gslqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
2226     "paddh      $f20, $f20, $f0                           \n\t"
2227     "paddh      $f22, $f22, $f2                           \n\t"
2228     "gslqc1     $f14, $f12, 0x60(%[tmp])                  \n\t"
2229     "paddh      $f20, $f20, $f12                          \n\t"
2230     "paddh      $f22, $f22, $f14                          \n\t"
2231     "paddh      $f20, $f20, $f8                           \n\t"
2232     "paddh      $f22, $f22, $f10                          \n\t"
2233     "dmtc1      $8, $f8                                   \n\t"
2234     "psrah      $f20, $f20, $f8                           \n\t"
2235     "psrah      $f22, $f22, $f8                           \n\t"
2236     "and        $f12, $f4, $f20                           \n\t"
2237     "and        $f14, $f6, $f22                           \n\t"
2238     "pandn      $f4, $f4, $f0                             \n\t"
2239     "pandn      $f6, $f6, $f2                             \n\t"
2240     "or         $f12, $f12, $f4                           \n\t"
2241     "or         $f14, $f14, $f6                           \n\t"
2242     "packushb   $f16, $f16, $f18                          \n\t"
2243     "packushb   $f18, $f12, $f14                          \n\t"
2244     "gssqc1     $f18, $f16, 0xa0(%[tmp])                  \n\t"
2245     "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
2246     "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
2247     "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
2248     "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
2249     "mov.d      $f26, $f2                                 \n\t"
2250     "punpckhbh  $f2, $f0, $f4                             \n\t"
2251     "punpcklbh  $f0, $f0, $f4                             \n\t"
2252     "punpcklbh  $f24, $f26, $f6                           \n\t"
2253     "punpckhbh  $f26, $f26, $f6                           \n\t"
2254     "mov.d      $f30, $f10                                \n\t"
2255     "punpckhbh  $f10, $f8, $f12                           \n\t"
2256     "punpcklbh  $f8, $f8, $f12                            \n\t"
2257     "punpcklbh  $f28, $f30, $f14                          \n\t"
2258     "punpckhbh  $f30, $f30, $f14                          \n\t"
2259     "punpcklhw  $f16, $f2, $f10                           \n\t"
2260     "punpckhhw  $f18, $f2, $f10                           \n\t"
2261     "punpcklhw  $f20, $f26, $f30                          \n\t"
2262     "punpckhhw  $f22, $f26, $f30                          \n\t"
2263     "punpckhhw  $f2, $f0, $f8                             \n\t"
2264     "punpcklhw  $f0, $f0, $f8                             \n\t"
2265     "punpckhhw  $f26, $f24, $f28                          \n\t"
2266     "punpcklhw  $f24, $f24, $f28                          \n\t"
2267     "punpcklwd  $f4, $f2, $f26                            \n\t"
2268     "punpckhwd  $f6, $f2, $f26                            \n\t"
2269     "punpcklwd  $f8, $f18, $f22                           \n\t"
2270     "punpckhwd  $f10, $f18, $f22                          \n\t"
2271     "punpckhwd  $f2, $f0, $f24                            \n\t"
2272     "punpcklwd  $f0, $f0, $f24                            \n\t"
2273     "punpckhwd  $f18, $f16, $f20                          \n\t"
2274     "punpcklwd  $f16, $f16, $f20                          \n\t"
2275     "mov.d      $f20, $f2                                 \n\t"
2276     "mov.d      $f24, $f6                                 \n\t"
2277     "mov.d      $f2, $f16                                 \n\t"
2278     "mov.d      $f22, $f18                                \n\t"
2279     "mov.d      $f6, $f8                                  \n\t"
2280     "mov.d      $f26, $f10                                \n\t"
2281     "dli        %[iAlpha], 0x20                           \n\t"
2282     "dmtc1      %[iAlpha], $f8                            \n\t"
2283     "gsswlc1    $f0, 0x3($9)                              \n\t"
2284     "gsswrc1    $f0, 0x0($9)                              \n\t"
2285     "daddu      $12, $9, %[iStride]                       \n\t"
2286     "gsswlc1    $f20, 0x3($12)                            \n\t"
2287     "gsswrc1    $f20, 0x0($12)                            \n\t"
2288     "daddu      $12, $12, %[iStride]                      \n\t"
2289     "gsswlc1    $f4, 0x3($12)                             \n\t"
2290     "gsswrc1    $f4, 0x0($12)                             \n\t"
2291     "daddu      $12, $12, %[iStride]                      \n\t"
2292     "gsswlc1    $f24, 0x3($12)                            \n\t"
2293     "gsswrc1    $f24, 0x0($12)                            \n\t"
2294     "dsrl       $f0, $f0, $f8                             \n\t"
2295     "dsrl       $f20, $f20, $f8                           \n\t"
2296     "dsrl       $f4, $f4, $f8                             \n\t"
2297     "dsrl       $f24, $f24, $f8                           \n\t"
2298     "gsswlc1    $f0, 0x3($10)                             \n\t"
2299     "gsswrc1    $f0, 0x0($10)                             \n\t"
2300     "daddu      $13, $10, %[iStride]                      \n\t"
2301     "daddu      $8, $13, %[iStride]                       \n\t"
2302     "gsswlc1    $f20, 0x3($13)                            \n\t"
2303     "gsswrc1    $f20, 0x0($13)                            \n\t"
2304     "daddu      $13, $8, %[iStride]                       \n\t"
2305     "gsswlc1    $f4, 0x3($8)                              \n\t"
2306     "gsswrc1    $f4, 0x0($8)                              \n\t"
2307     "gsswlc1    $f24, 0x3($13)                            \n\t"
2308     "gsswrc1    $f24, 0x0($13)                            \n\t"
2309     "gsswlc1    $f2, 0x3(%[pPixCb])                       \n\t"
2310     "gsswrc1    $f2, 0x0(%[pPixCb])                       \n\t"
2311     "daddu      $12, %[pPixCb], %[iStride]                \n\t"
2312     "gsswlc1    $f22, 0x3($12)                            \n\t"
2313     "gsswrc1    $f22, 0x0($12)                            \n\t"
2314     "daddu      $12, $12, %[iStride]                      \n\t"
2315     "gsswlc1    $f6, 0x3($12)                             \n\t"
2316     "gsswrc1    $f6, 0x0($12)                             \n\t"
2317     "daddu      $12, $12, %[iStride]                      \n\t"
2318     "gsswlc1    $f26, 0x3($12)                            \n\t"
2319     "gsswrc1    $f26, 0x0($12)                            \n\t"
2320     "dsrl       $f2, $f2, $f8                             \n\t"
2321     "dsrl       $f22, $f22, $f8                           \n\t"
2322     "dsrl       $f6, $f6, $f8                             \n\t"
2323     "dsrl       $f26, $f26, $f8                           \n\t"
2324     "gsswlc1    $f2, 0x3(%[pPixCr])                       \n\t"
2325     "gsswrc1    $f2, 0x0(%[pPixCr])                       \n\t"
2326     "daddu      $13, %[pPixCr], %[iStride]                \n\t"
2327     "daddu      $8, $13, %[iStride]                       \n\t"
2328     "gsswlc1    $f22, 0x3($13)                            \n\t"
2329     "gsswrc1    $f22, 0x0($13)                            \n\t"
2330     "daddu      $13, $8, %[iStride]                       \n\t"
2331     "gsswlc1    $f6, 0x3($8)                              \n\t"
2332     "gsswrc1    $f6, 0x0($8)                              \n\t"
2333     "gsswlc1    $f26, 0x3($13)                            \n\t"
2334     "gsswrc1    $f26, 0x0($13)                            \n\t"
2335     : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
2336     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
2337       [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
2338     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
2339       "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2340       "$f24", "$f26", "$f28", "$f30"
2341   );
2342   RECOVER_REG;
2343 }
2344 
DeblockChromaLt4H_mmi(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTC)2345 void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
2346                            int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
2347   unsigned char tmp[320] __attribute__((aligned(32)));
2348   BACKUP_REG;
2349   __asm__ volatile (
2350     ".set       arch=loongson3a                           \n\t"
2351     "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
2352     "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
2353     "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
2354     "gsldlc1    $f0, 0x7(%[pPixCb])                       \n\t"
2355     "gsldlc1    $f4, 0x7($8)                              \n\t"
2356     "gsldrc1    $f0, 0x0(%[pPixCb])                       \n\t"
2357     "gsldrc1    $f4, 0x0($8)                              \n\t"
2358     "daddu      $9, $8, %[iStride]                        \n\t"
2359     "daddu      $8, $9, %[iStride]                        \n\t"
2360     "gsldlc1    $f8, 0x7($9)                              \n\t"
2361     "gsldlc1    $f12, 0x7($8)                             \n\t"
2362     "gsldrc1    $f8, 0x0($9)                              \n\t"
2363     "gsldrc1    $f12, 0x0($8)                             \n\t"
2364     "daddu      $9, $8, %[iStride]                        \n\t"
2365 
2366     "daddu      $10, %[pPixCr], %[iStride]                \n\t"
2367     "gsldlc1    $f16, 0x7(%[pPixCr])                      \n\t"
2368     "gsldlc1    $f20, 0x7($10)                            \n\t"
2369     "gsldrc1    $f16, 0x0(%[pPixCr])                      \n\t"
2370     "gsldrc1    $f20, 0x0($10)                            \n\t"
2371     "daddu      $11, $10, %[iStride]                      \n\t"
2372     "daddu      $10, $11, %[iStride]                      \n\t"
2373     "gsldlc1    $f24, 0x7($11)                            \n\t"
2374     "gsldlc1    $f28, 0x7($10)                            \n\t"
2375     "gsldrc1    $f24, 0x0($11)                            \n\t"
2376     "gsldrc1    $f28, 0x0($10)                            \n\t"
2377     "daddu      $11, $10, %[iStride]                      \n\t"
2378 
2379     "punpcklwd  $f0, $f0, $f16                            \n\t"
2380     "punpcklwd  $f4, $f4, $f20                            \n\t"
2381     "punpcklwd  $f8, $f8, $f24                            \n\t"
2382     "punpcklwd  $f12, $f12, $f28                          \n\t"
2383     "gsldlc1    $f16, 0x7($9)                             \n\t"
2384     "gsldlc1    $f20, 0x7($11)                            \n\t"
2385     "gsldrc1    $f16, 0x0($9)                             \n\t"
2386     "gsldrc1    $f20, 0x0($11)                            \n\t"
2387     "punpcklwd  $f16, $f16, $f20                          \n\t"
2388     "mov.d      $f2, $f16                                 \n\t"
2389     "daddu      $8, $9, %[iStride]                        \n\t"
2390     "daddu      $10, $11, %[iStride]                      \n\t"
2391     "gsldlc1    $f16, 0x7($8)                             \n\t"
2392     "gsldlc1    $f20, 0x7($10)                            \n\t"
2393     "gsldrc1    $f16, 0x0($8)                             \n\t"
2394     "gsldrc1    $f20, 0x0($10)                            \n\t"
2395     "punpcklwd  $f16, $f16, $f20                          \n\t"
2396     "mov.d      $f6, $f16                                 \n\t"
2397     "daddu      $9, $8, %[iStride]                        \n\t"
2398     "daddu      $11, $10, %[iStride]                      \n\t"
2399 
2400     "gsldlc1    $f16, 0x7($9)                             \n\t"
2401     "gsldlc1    $f20, 0x7($11)                            \n\t"
2402     "gsldrc1    $f16, 0x0($9)                             \n\t"
2403     "gsldrc1    $f20, 0x0($11)                            \n\t"
2404     "punpcklwd  $f16, $f16, $f20                          \n\t"
2405     "mov.d      $f10, $f16                                \n\t"
2406     "daddu      $8, $9, %[iStride]                        \n\t"
2407     "daddu      $10, $11, %[iStride]                      \n\t"
2408 
2409     "gsldlc1    $f16, 0x7($8)                             \n\t"
2410     "gsldlc1    $f20, 0x7($10)                            \n\t"
2411     "gsldrc1    $f16, 0x0($8)                             \n\t"
2412     "gsldrc1    $f20, 0x0($10)                            \n\t"
2413     "punpcklwd  $f16, $f16, $f20                          \n\t"
2414     "mov.d      $f14, $f16                                \n\t"
2415 
2416     "punpcklbh  $f24, $f2, $f6                            \n\t"
2417     "punpckhbh  $f26, $f2, $f6                            \n\t"
2418     "punpckhbh  $f2, $f0, $f4                             \n\t"
2419     "punpcklbh  $f0, $f0, $f4                             \n\t"
2420     "punpcklbh  $f28, $f10, $f14                          \n\t"
2421     "punpckhbh  $f30, $f10, $f14                          \n\t"
2422     "punpckhbh  $f10, $f8, $f12                           \n\t"
2423     "punpcklbh  $f8, $f8, $f12                            \n\t"
2424 
2425     "punpcklhw  $f16, $f2, $f10                           \n\t"
2426     "punpckhhw  $f18, $f2, $f10                           \n\t"
2427     "punpckhhw  $f2, $f0, $f8                             \n\t"
2428     "punpcklhw  $f0, $f0, $f8                             \n\t"
2429     "punpcklhw  $f20, $f26, $f30                          \n\t"
2430     "punpckhhw  $f22, $f26, $f30                          \n\t"
2431     "punpckhhw  $f26, $f24, $f28                          \n\t"
2432     "punpcklhw  $f24, $f24, $f28                          \n\t"
2433 
2434     "punpcklwd  $f4, $f2, $f26                            \n\t"
2435     "punpckhwd  $f6, $f2, $f26                            \n\t"
2436     "punpckhwd  $f2, $f0, $f24                            \n\t"
2437     "punpcklwd  $f0, $f0, $f24                            \n\t"
2438     "punpcklwd  $f8, $f18, $f22                           \n\t"
2439     "punpckhwd  $f10, $f18, $f22                          \n\t"
2440     "punpckhwd  $f18, $f16, $f20                          \n\t"
2441     "punpcklwd  $f16, $f16, $f20                          \n\t"
2442 
2443     "mov.d      $f20, $f2                                 \n\t"
2444     "mov.d      $f22, $f18                                \n\t"
2445     "mov.d      $f2, $f16                                 \n\t"
2446     "mov.d      $f24, $f6                                 \n\t"
2447     "mov.d      $f26, $f10                                \n\t"
2448     "mov.d      $f6, $f8                                  \n\t"
2449     "daddiu     $11, %[tmp], 0x70                         \n\t"
2450 
2451     "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
2452     "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
2453     "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
2454     "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
2455 
2456     "lb         $8, 0x3(%[pTC])                           \n\t"
2457     "lb         $9, 0x2(%[pTC])                           \n\t"
2458     "lb         $10, 0x1(%[pTC])                          \n\t"
2459     "lb         $11, 0x0(%[pTC])                          \n\t"
2460 
2461     "and        $12, $8, 0xFFFF                           \n\t"
2462     "dmtc1      $12, $f8                                  \n\t"
2463 
2464     "and        $9, $9, 0xFFFF                            \n\t"
2465     "dmtc1      $9, $f12                                  \n\t"
2466     "mov.d      $f16, $f12                                \n\t"
2467 
2468     "and        $9, $10, 0xFFFF                           \n\t"
2469     "dmtc1      $9, $f20                                  \n\t"
2470     "xor        $f0, $f0, $f0                             \n\t"
2471     "mov.d      $f24, $f20                                \n\t"
2472     "and        $9, $11, 0xFFFF                           \n\t"
2473     "punpcklhw  $f24, $f24, $f8                           \n\t"
2474 
2475     "mov.d      $f4, $f8                                  \n\t"
2476     "dmtc1      $9, $f28                                  \n\t"
2477     "mov.d      $f0, $f28                                 \n\t"
2478 
2479     "punpcklhw  $f28, $f28, $f12                          \n\t"
2480     "punpcklhw  $f20, $f20, $f4                           \n\t"
2481     "xor        $f4, $f4, $f4                             \n\t"
2482     "xor        $f6, $f6, $f6                             \n\t"
2483     "punpcklhw  $f28, $f28, $f20                          \n\t"
2484     "gslqc1     $f22, $f20, 0xA0(%[tmp])                  \n\t"
2485     "punpcklhw  $f0, $f0, $f16                            \n\t"
2486     "punpcklhw  $f0, $f0, $f24                            \n\t"
2487 
2488     "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
2489     "punpckhhw  $f2, $f0, $f28                            \n\t"
2490     "punpcklhw  $f0, $f0, $f28                            \n\t"
2491     "gslqc1     $f30, $f28, 0x80(%[tmp])                  \n\t"
2492     "psubh      $f8, $f4, $f0                             \n\t"
2493     "psubh      $f10, $f6, $f2                            \n\t"
2494     "gssqc1     $f10, $f8, 0xD0(%[tmp])                   \n\t"
2495     "dmtc1      %[iAlpha], $f8                            \n\t"
2496     "punpcklhw  $f12, $f8, $f8                            \n\t"
2497     "punpcklwd  $f16, $f12, $f12                          \n\t"
2498     "mov.d      $f18, $f16                                \n\t"
2499 
2500     "dmtc1      %[iBeta], $f8                             \n\t"
2501     "punpcklhw  $f12, $f8, $f8                            \n\t"
2502     "punpcklwd  $f8, $f12, $f12                           \n\t"
2503     "mov.d      $f10, $f8                                 \n\t"
2504 
2505     "gslqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
2506     "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
2507     "punpckhbh  $f10, $f24, $f4                           \n\t"
2508     "punpcklbh  $f8, $f24, $f4                            \n\t"
2509     "punpcklbh  $f24, $f26, $f6                           \n\t"
2510     "punpckhbh  $f26, $f26, $f6                           \n\t"
2511 
2512     "gssqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
2513     "gssqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
2514     "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
2515     "punpcklbh  $f8, $f28, $f4                            \n\t"
2516     "punpckhbh  $f10, $f28, $f4                           \n\t"
2517     "punpcklbh  $f28, $f30, $f6                           \n\t"
2518     "punpckhbh  $f30, $f30, $f6                           \n\t"
2519     "punpcklbh  $f24, $f26, $f6                           \n\t"
2520     "punpckhbh  $f26, $f26, $f6                           \n\t"
2521     "punpckhbh  $f14, $f12, $f4                           \n\t"
2522     "punpcklbh  $f12, $f12, $f4                           \n\t"
2523     "punpckhbh  $f22, $f20, $f4                           \n\t"
2524     "punpcklbh  $f20, $f20, $f4                           \n\t"
2525     "gssqc1     $f30, $f28, 0xF0(%[tmp])                  \n\t"
2526     "gssqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
2527     "gslqc1     $f26, $f24, 0xA0(%[tmp])                  \n\t"
2528     "punpcklbh  $f24, $f26, $f6                           \n\t"
2529     "punpckhbh  $f26, $f26, $f6                           \n\t"
2530 
2531     "dli        $13, 0x4                                  \n\t"
2532     "gssqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
2533     "dmtc1      $13, $f24                                 \n\t"
2534     "punpcklhw  $f28, $f24, $f24                          \n\t"
2535     "punpcklwd  $f24, $f28, $f28                          \n\t"
2536     "mov.d      $f26, $f24                                \n\t"
2537     "dli        $12, 0x2                                  \n\t"
2538     "dli        $13, 0x3                                  \n\t"
2539 
2540     "gssqc1     $f2, $f0, 0x20(%[tmp])                    \n\t"
2541     "dmfc1      %[iAlpha], $f0                            \n\t"
2542     "dmfc1      %[iBeta], $f2                             \n\t"
2543     "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
2544     "gslqc1     $f30, $f28, 0x40(%[tmp])                  \n\t"
2545     "psubh      $f28, $f28, $f20                          \n\t"
2546     "psubh      $f30, $f30, $f22                          \n\t"
2547     "pcmpgth    $f24, $f0, $f4                            \n\t"
2548     "pcmpgth    $f26, $f2, $f6                            \n\t"
2549 
2550     "dmtc1      $12, $f0                                  \n\t"
2551     "dmtc1      $13, $f2                                  \n\t"
2552     "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
2553     "gslqc1     $f6, $f4, 0xD0(%[tmp])                    \n\t"
2554     "psubh      $f24, $f12, $f8                           \n\t"
2555     "psubh      $f26, $f14, $f10                          \n\t"
2556     "psllh      $f24, $f24, $f0                           \n\t"
2557     "psllh      $f26, $f26, $f0                           \n\t"
2558     "paddh      $f24, $f24, $f28                          \n\t"
2559     "paddh      $f26, $f26, $f30                          \n\t"
2560     "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
2561     "paddh      $f24, $f24, $f28                          \n\t"
2562     "paddh      $f26, $f26, $f30                          \n\t"
2563     "psrah      $f24, $f24, $f2                           \n\t"
2564     "psrah      $f26, $f26, $f2                           \n\t"
2565     "pmaxsh     $f4, $f4, $f24                            \n\t"
2566     "pmaxsh     $f6, $f6, $f26                            \n\t"
2567 
2568     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
2569     "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
2570     "pminsh     $f24, $f24, $f4                           \n\t"
2571     "pminsh     $f26, $f26, $f6                           \n\t"
2572 
2573     "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
2574     "psubh      $f4, $f8, $f12                            \n\t"
2575     "psubh      $f6, $f10, $f14                           \n\t"
2576     WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
2577     "pcmpgth    $f24, $f16, $f4                           \n\t"
2578     "pcmpgth    $f26, $f18, $f6                           \n\t"
2579     "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
2580     "psubh      $f4, $f4, $f8                             \n\t"
2581     "psubh      $f6, $f6, $f10                            \n\t"
2582     WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
2583     "pcmpgth    $f28, $f28, $f4                           \n\t"
2584     "pcmpgth    $f30, $f30, $f6                           \n\t"
2585 
2586     "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
2587     "and        $f24, $f24, $f28                          \n\t"
2588     "and        $f26, $f26, $f30                          \n\t"
2589     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
2590     "psubh      $f20, $f20, $f12                          \n\t"
2591     "psubh      $f22, $f22, $f14                          \n\t"
2592     WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
2593     "pcmpgth    $f4, $f4, $f20                            \n\t"
2594     "pcmpgth    $f6, $f6, $f22                            \n\t"
2595 
2596     "gslqc1     $f22, $f20, 0xB0(%[tmp])                  \n\t"
2597     "gslqc1     $f2, $f0, 0xE0(%[tmp])                    \n\t"
2598     "psubh      $f20, $f20, $f0                           \n\t"
2599     "psubh      $f22, $f22, $f2                           \n\t"
2600     "and        $f24, $f24, $f4                           \n\t"
2601     "and        $f26, $f26, $f6                           \n\t"
2602     "gslqc1     $f2, $f0, 0x60(%[tmp])                    \n\t"
2603     "and        $f24, $f24, $f0                           \n\t"
2604     "and        $f26, $f26, $f2                           \n\t"
2605 
2606     "gslqc1     $f6, $f4, 0x20(%[tmp])                    \n\t"
2607     "and        $f4, $f4, $f24                            \n\t"
2608     "and        $f6, $f6, $f26                            \n\t"
2609     "gslqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
2610     "gssqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
2611     "gslqc1     $f6, $f4, 0xF0(%[tmp])                    \n\t"
2612 
2613     "dmtc1      $12, $f0                                  \n\t"
2614     "psubh      $f24, $f24, $f4                           \n\t"
2615     "psubh      $f26, $f26, $f6                           \n\t"
2616     "psllh      $f24, $f24, $f0                           \n\t"
2617     "psllh      $f26, $f26, $f0                           \n\t"
2618     "paddh      $f24, $f24, $f20                          \n\t"
2619     "paddh      $f26, $f26, $f22                          \n\t"
2620     "gslqc1     $f2, $f0, 0x30(%[tmp])                    \n\t"
2621     "paddh      $f24, $f24, $f0                           \n\t"
2622     "paddh      $f26, $f26, $f2                           \n\t"
2623     "dmtc1      %[iBeta], $f2                             \n\t"
2624 
2625     "dmtc1      $13, $f0                                  \n\t"
2626     "gslqc1     $f22, $f20, 0xD0(%[tmp])                  \n\t"
2627     "psrah      $f24, $f24, $f0                           \n\t"
2628     "psrah      $f26, $f26, $f0                           \n\t"
2629     "dmtc1      %[iAlpha], $f0                            \n\t"
2630     "pmaxsh     $f20, $f20, $f24                          \n\t"
2631     "pmaxsh     $f22, $f22, $f26                          \n\t"
2632     "pminsh     $f0, $f0, $f20                            \n\t"
2633     "pminsh     $f2, $f2, $f22                            \n\t"
2634 
2635     "dmfc1      %[iAlpha], $f0                            \n\t"
2636     "dmfc1      %[iBeta], $f2                             \n\t"
2637     "gslqc1     $f22, $f20, 0xC0(%[tmp])                  \n\t"
2638     "psubh      $f24, $f4, $f20                           \n\t"
2639     "psubh      $f26, $f6, $f22                           \n\t"
2640     WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2641     "pcmpgth    $f16, $f16, $f24                          \n\t"
2642     "pcmpgth    $f18, $f18, $f26                          \n\t"
2643 
2644     "gslqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
2645     "psubh      $f24, $f24, $f4                           \n\t"
2646     "psubh      $f26, $f26, $f6                           \n\t"
2647     WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2648     "pcmpgth    $f28, $f28, $f24                          \n\t"
2649     "pcmpgth    $f30, $f30, $f26                          \n\t"
2650 
2651     "gslqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
2652     "and        $f16, $f16, $f28                          \n\t"
2653     "and        $f18, $f18, $f30                          \n\t"
2654 
2655     "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
2656     "psubh      $f24, $f24, $f20                          \n\t"
2657     "psubh      $f26, $f26, $f22                          \n\t"
2658     WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
2659     "pcmpgth    $f28, $f28, $f24                          \n\t"
2660     "pcmpgth    $f30, $f30, $f26                          \n\t"
2661     "and        $f16, $f16, $f28                          \n\t"
2662     "and        $f18, $f18, $f30                          \n\t"
2663     "gslqc1     $f30, $f28, 0x60(%[tmp])                  \n\t"
2664     "dmtc1      %[iAlpha], $f0                            \n\t"
2665     "dmtc1      %[iBeta], $f2                             \n\t"
2666     "and        $f16, $f16, $f28                          \n\t"
2667     "and        $f18, $f18, $f30                          \n\t"
2668     "and        $f0, $f0, $f16                            \n\t"
2669     "and        $f2, $f2, $f18                            \n\t"
2670 
2671     "gslqc1     $f18, $f16, 0x40(%[tmp])                  \n\t"
2672     "paddh      $f8, $f8, $f16                            \n\t"
2673     "paddh      $f10, $f10, $f18                          \n\t"
2674     "paddh      $f4, $f4, $f0                             \n\t"
2675     "paddh      $f6, $f6, $f2                             \n\t"
2676     "psubh      $f12, $f12, $f16                          \n\t"
2677     "psubh      $f14, $f14, $f18                          \n\t"
2678     "psubh      $f20, $f20, $f0                           \n\t"
2679     "psubh      $f22, $f22, $f2                           \n\t"
2680     "packushb   $f8, $f8, $f10                            \n\t"
2681     "packushb   $f10, $f4, $f6                            \n\t"
2682     "packushb   $f12, $f12, $f14                          \n\t"
2683     "packushb   $f14, $f20, $f22                          \n\t"
2684 
2685     "gssqc1     $f10, $f8, 0x80(%[tmp])                   \n\t"
2686     "gssqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
2687     "daddiu     $11, %[tmp], 0x70                         \n\t"
2688 
2689     "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
2690     "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
2691     "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
2692     "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
2693 
2694     "punpcklbh  $f24, $f2, $f6                            \n\t"
2695     "punpckhbh  $f26, $f2, $f6                            \n\t"
2696     "punpckhbh  $f2, $f0, $f4                             \n\t"
2697     "punpcklbh  $f0, $f0, $f4                             \n\t"
2698 
2699     "punpcklbh  $f28, $f10, $f14                          \n\t"
2700     "punpckhbh  $f30, $f10, $f14                          \n\t"
2701     "punpckhbh  $f10, $f8, $f12                           \n\t"
2702     "punpcklbh  $f8, $f8, $f12                            \n\t"
2703 
2704     "punpcklhw  $f16, $f2, $f10                           \n\t"
2705     "punpckhhw  $f18, $f2, $f10                           \n\t"
2706     "punpckhhw  $f2, $f0, $f8                             \n\t"
2707     "punpcklhw  $f0, $f0, $f8                             \n\t"
2708     "punpcklhw  $f20, $f26, $f30                          \n\t"
2709     "punpckhhw  $f22, $f26, $f30                          \n\t"
2710     "punpckhhw  $f26, $f24, $f28                          \n\t"
2711     "punpcklhw  $f24, $f24, $f28                          \n\t"
2712 
2713     "punpcklwd  $f4, $f2, $f26                            \n\t"
2714     "punpckhwd  $f6, $f2, $f26                            \n\t"
2715     "punpckhwd  $f2, $f0, $f24                            \n\t"
2716     "punpcklwd  $f0, $f0, $f24                            \n\t"
2717     "punpcklwd  $f8, $f18, $f22                           \n\t"
2718     "punpckhwd  $f10, $f18, $f22                          \n\t"
2719     "punpckhwd  $f18, $f16, $f20                          \n\t"
2720     "punpcklwd  $f16, $f16, $f20                          \n\t"
2721 
2722     "mov.d      $f20, $f2                                 \n\t"
2723     "mov.d      $f22, $f18                                \n\t"
2724     "mov.d      $f2, $f16                                 \n\t"
2725     "mov.d      $f24, $f6                                 \n\t"
2726     "mov.d      $f26, $f10                                \n\t"
2727     "mov.d      $f6, $f8                                  \n\t"
2728 
2729     "dli        %[iAlpha], 0x20                           \n\t"
2730     "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
2731     "gsswlc1    $f0, 0x3(%[pPixCb])                       \n\t"
2732     "gsswlc1    $f20, 0x3($8)                             \n\t"
2733     "gsswrc1    $f0, 0x0(%[pPixCb])                       \n\t"
2734     "gsswrc1    $f20, 0x0($8)                             \n\t"
2735     "daddu      $9, $8, %[iStride]                        \n\t"
2736     "daddu      $8, $9, %[iStride]                        \n\t"
2737     "gsswlc1    $f4, 0x3($9)                              \n\t"
2738     "gsswlc1    $f24, 0x3($8)                             \n\t"
2739     "gsswrc1    $f4, 0x0($9)                              \n\t"
2740     "gsswrc1    $f24, 0x0($8)                             \n\t"
2741     "daddu      $9, $8, %[iStride]                        \n\t"
2742     "dmtc1      %[iAlpha], $f8                            \n\t"
2743 
2744     "dsrl       $f0, $f0, $f8                             \n\t"
2745     "dsrl       $f20, $f20, $f8                           \n\t"
2746     "dsrl       $f4, $f4, $f8                             \n\t"
2747     "dsrl       $f24, $f24, $f8                           \n\t"
2748     "daddu      $10, %[pPixCr], %[iStride]                \n\t"
2749     "gsswlc1    $f0, 0x3(%[pPixCr])                       \n\t"
2750     "gsswlc1    $f20, 0x3($10)                            \n\t"
2751     "gsswrc1    $f0, 0x0(%[pPixCr])                       \n\t"
2752     "gsswrc1    $f20, 0x0($10)                            \n\t"
2753     "daddu      $11, $10, %[iStride]                      \n\t"
2754     "daddu      $10, $11, %[iStride]                      \n\t"
2755     "gsswlc1    $f4, 0x3($11)                             \n\t"
2756     "gsswlc1    $f24, 0x3($10)                            \n\t"
2757     "gsswrc1    $f4, 0x0($11)                             \n\t"
2758     "gsswrc1    $f24, 0x0($10)                            \n\t"
2759     "daddu      $11, $10, %[iStride]                      \n\t"
2760 
2761     "daddu      $8, $9, %[iStride]                        \n\t"
2762     "gsswlc1    $f2, 0x3($9)                              \n\t"
2763     "gsswlc1    $f22, 0x3($8)                             \n\t"
2764     "gsswrc1    $f2, 0x0($9)                              \n\t"
2765     "gsswrc1    $f22, 0x0($8)                             \n\t"
2766     "daddu      $9, $8, %[iStride]                        \n\t"
2767     "daddu      $8, $9, %[iStride]                        \n\t"
2768     "gsswlc1    $f6, 0x3($9)                              \n\t"
2769     "gsswlc1    $f26, 0x3($8)                             \n\t"
2770     "gsswrc1    $f6, 0x0($9)                              \n\t"
2771     "gsswrc1    $f26, 0x0($8)                             \n\t"
2772 
2773     "dsrl       $f2, $f2, $f8                             \n\t"
2774     "dsrl       $f22, $f22, $f8                           \n\t"
2775     "dsrl       $f6, $f6, $f8                             \n\t"
2776     "dsrl       $f26, $f26, $f8                           \n\t"
2777     "daddu      $10, $11, %[iStride]                      \n\t"
2778     "gsswlc1    $f2, 0x3($11)                             \n\t"
2779     "gsswlc1    $f22, 0x3($10)                            \n\t"
2780     "gsswrc1    $f2, 0x0($11)                             \n\t"
2781     "gsswrc1    $f22, 0x0($10)                            \n\t"
2782     "daddu      $11, $10, %[iStride]                      \n\t"
2783     "daddu      $10, $11, %[iStride]                      \n\t"
2784     "gsswlc1    $f6, 0x3($11)                             \n\t"
2785     "gsswlc1    $f26, 0x3($10)                            \n\t"
2786     "gsswrc1    $f6, 0x0($11)                             \n\t"
2787     "gsswrc1    $f26, 0x0($10)                            \n\t"
2788     : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
2789     : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
2790       [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
2791     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
2792       "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
2793       "$f26", "$f28", "$f30"
2794   );
2795   RECOVER_REG;
2796 }
2797 
WelsNonZeroCount_mmi(int8_t * pNonZeroCount)2798 void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
2799   __asm__ volatile(
2800     ".set       arch=loongson3a                 \n\t"
2801     "gsldlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
2802     "gsldlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
2803     "gsldlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
2804     "gsldrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
2805     "gsldrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
2806     "gsldrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
2807     "pcmpeqh    $f8, $f8, $f8                   \n\t"
2808     "dli        $8, 0xF                         \n\t"
2809     "dmtc1      $8, $f6                         \n\t"
2810     "psrlh      $f8, $f8, $f6                   \n\t"
2811     "packushb   $f8, $f8, $f8                   \n\t"
2812 
2813     "pminub     $f0, $f0, $f8                   \n\t"
2814     "pminub     $f2, $f2, $f8                   \n\t"
2815     "pminub     $f4, $f4, $f8                   \n\t"
2816     "gssdlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
2817     "gssdlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
2818     "gssdlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
2819     "gssdrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
2820     "gssdrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
2821     "gssdrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
2822     :
2823     : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
2824     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
2825   );
2826 }
2827