• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    copy_mb_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    20/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
WelsCopy8x8_mmi(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)43 void WelsCopy8x8_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
44                      int32_t  iStrideS ) {
45   __asm__ volatile (
46     ".set       arch=loongson3a                 \n\t"
47     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
48     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
49     "gsldlc1    $f2, 0x7($8)                    \n\t"
50     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
51     "gsldrc1    $f2, 0x0($8)                    \n\t"
52     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
53     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
54     "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
55     "gsldlc1    $f6, 0x7($8)                    \n\t"
56     "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
57     "gsldrc1    $f6, 0x0($8)                    \n\t"
58     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
59     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
60     "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
61     "gsldlc1    $f10, 0x7($8)                   \n\t"
62     "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
63     "gsldrc1    $f10, 0x0($8)                   \n\t"
64     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
65     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
66     "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
67     "gsldlc1    $f14, 0x7($8)                   \n\t"
68     "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
69     "gsldrc1    $f14, 0x0($8)                   \n\t"
70 
71     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
72     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
73     "gssdlc1    $f2, 0x7($8)                    \n\t"
74     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
75     "gssdrc1    $f2, 0x0($8)                    \n\t"
76     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
77     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
78     "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
79     "gssdlc1    $f6, 0x7($8)                    \n\t"
80     "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
81     "gssdrc1    $f6, 0x0($8)                    \n\t"
82     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
83     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
84     "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
85     "gssdlc1    $f10, 0x7($8)                   \n\t"
86     "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
87     "gssdrc1    $f10, 0x0($8)                   \n\t"
88     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
89     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
90     "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
91     "gssdlc1    $f14, 0x7($8)                   \n\t"
92     "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
93     "gssdrc1    $f14, 0x0($8)                   \n\t"
94    : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
95    : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
96    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
97   );
98 }
99 
WelsCopy8x16_mmi(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)100 void WelsCopy8x16_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
101                       int32_t iStrideS) {
102   __asm__ volatile (
103     ".set       arch=loongson3a                 \n\t"
104     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
105     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
106     "gsldlc1    $f2, 0x7($8)                    \n\t"
107     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
108     "gsldrc1    $f2, 0x0($8)                    \n\t"
109     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
110     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
111     "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
112     "gsldlc1    $f6, 0x7($8)                    \n\t"
113     "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
114     "gsldrc1    $f6, 0x0($8)                    \n\t"
115     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
116     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
117     "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
118     "gsldlc1    $f10, 0x7($8)                   \n\t"
119     "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
120     "gsldrc1    $f10, 0x0($8)                   \n\t"
121     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
122     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
123     "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
124     "gsldlc1    $f14, 0x7($8)                   \n\t"
125     "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
126     "gsldrc1    $f14, 0x0($8)                   \n\t"
127     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
128 
129     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
130     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
131     "gssdlc1    $f2, 0x7($8)                    \n\t"
132     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
133     "gssdrc1    $f2, 0x0($8)                    \n\t"
134     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
135     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
136     "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
137     "gssdlc1    $f6, 0x7($8)                    \n\t"
138     "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
139     "gssdrc1    $f6, 0x0($8)                    \n\t"
140     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
141     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
142     "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
143     "gssdlc1    $f10, 0x7($8)                   \n\t"
144     "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
145     "gssdrc1    $f10, 0x0($8)                   \n\t"
146     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
147     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
148     "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
149     "gssdlc1    $f14, 0x7($8)                   \n\t"
150     "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
151     "gssdrc1    $f14, 0x0($8)                   \n\t"
152     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
153 
154     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
155     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
156     "gsldlc1    $f2, 0x7($8)                    \n\t"
157     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
158     "gsldrc1    $f2, 0x0($8)                    \n\t"
159     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
160     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
161     "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
162     "gsldlc1    $f6, 0x7($8)                    \n\t"
163     "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
164     "gsldrc1    $f6, 0x0($8)                    \n\t"
165     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
166     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
167     "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
168     "gsldlc1    $f10, 0x7($8)                   \n\t"
169     "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
170     "gsldrc1    $f10, 0x0($8)                   \n\t"
171     PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
172     PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
173     "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
174     "gsldlc1    $f14, 0x7($8)                   \n\t"
175     "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
176     "gsldrc1    $f14, 0x0($8)                   \n\t"
177 
178     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
179     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
180     "gssdlc1    $f2, 0x7($8)                    \n\t"
181     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
182     "gssdrc1    $f2, 0x0($8)                    \n\t"
183     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
184     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
185     "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
186     "gssdlc1    $f6, 0x7($8)                    \n\t"
187     "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
188     "gssdrc1    $f6, 0x0($8)                    \n\t"
189     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
190     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
191     "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
192     "gssdlc1    $f10, 0x7($8)                   \n\t"
193     "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
194     "gssdrc1    $f10, 0x0($8)                   \n\t"
195     PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
196     PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
197     "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
198     "gssdlc1    $f14, 0x7($8)                   \n\t"
199     "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
200     "gssdrc1    $f14, 0x0($8)                   \n\t"
201    : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
202    : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
203    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
204   );
205 }
206 
WelsCopy16x16_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)207 void WelsCopy16x16_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
208                        int32_t iSrcStride) {
209   BACKUP_REG;
210   __asm__ volatile (
211     ".set       arch=loongson3a                 \n\t"
212     "gslqc1     $f0, $f2, 0x0(%[pSrc])          \n\t"
213     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
214     "gslqc1     $f4, $f6, 0x0(%[pSrc])          \n\t"
215     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
216     "gslqc1     $f8, $f10, 0x0(%[pSrc])         \n\t"
217     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
218     "gslqc1     $f12, $f14, 0x0(%[pSrc])        \n\t"
219     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
220     "gslqc1     $f16, $f18, 0x0(%[pSrc])        \n\t"
221     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
222     "gslqc1     $f20, $f22, 0x0(%[pSrc])        \n\t"
223     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
224     "gslqc1     $f24, $f26, 0x0(%[pSrc])        \n\t"
225     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
226     "gslqc1     $f28, $f30, 0x0(%[pSrc])        \n\t"
227     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
228 
229     "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
230     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
231     "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
232     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
233     "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
234     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
235     "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
236     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
237     "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
238     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
239     "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
240     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
241     "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
242     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
243     "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t"
244     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
245 
246     "gslqc1     $f0, $f2, 0x0(%[pSrc])          \n\t"
247     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
248     "gslqc1     $f4, $f6, 0x0(%[pSrc])          \n\t"
249     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
250     "gslqc1     $f8, $f10, 0x0(%[pSrc])         \n\t"
251     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
252     "gslqc1     $f12, $f14, 0x0(%[pSrc])        \n\t"
253     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
254     "gslqc1     $f16, $f18, 0x0(%[pSrc])        \n\t"
255     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
256     "gslqc1     $f20, $f22, 0x0(%[pSrc])        \n\t"
257     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
258     "gslqc1     $f24, $f26, 0x0(%[pSrc])        \n\t"
259     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
260     "gslqc1     $f28, $f30, 0x0(%[pSrc])        \n\t"
261 
262     "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
263     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
264     "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
265     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
266     "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
267     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
268     "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
269     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
270     "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
271     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
272     "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
273     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
274     "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
275     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
276     "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t"
277    : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
278    : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
279    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
280      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
281   );
282   RECOVER_REG;
283 }
284 
WelsCopy16x16NotAligned_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)285 void WelsCopy16x16NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
286                                  int32_t iSrcStride) {
287   BACKUP_REG;
288   __asm__ volatile (
289     ".set       arch=loongson3a                 \n\t"
290     "gsldlc1    $f2, 0x7(%[pSrc])               \n\t"
291     "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
292     "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
293     "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
294     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
295     "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
296     "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
297     "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
298     "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
299     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
300     "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
301     "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
302     "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
303     "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
304     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
305     "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
306     "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
307     "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
308     "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
309     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
310     "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
311     "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
312     "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
313     "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
314     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
315     "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
316     "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
317     "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
318     "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
319     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
320     "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
321     "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
322     "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
323     "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
324     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
325     "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
326     "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
327     "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
328     "gsldrc1    $f28, 0x8(%[pSrc])              \n\t"
329     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
330 
331     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
332     "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
333     "gssqc1     $f4, $f6, 0x0($8)               \n\t"
334     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
335     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
336     "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
337     "gssqc1     $f12, $f14, 0x0($8)             \n\t"
338     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
339     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
340     "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
341     "gssqc1     $f20, $f22, 0x0($8)             \n\t"
342     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
343     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
344     "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
345     "gssqc1     $f28, $f30, 0x0($8)             \n\t"
346     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
347 
348     "gsldlc1    $f2, 0x7(%[pSrc])               \n\t"
349     "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
350     "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
351     "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
352     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
353     "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
354     "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
355     "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
356     "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
357     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
358     "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
359     "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
360     "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
361     "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
362     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
363     "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
364     "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
365     "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
366     "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
367     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
368     "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
369     "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
370     "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
371     "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
372     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
373     "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
374     "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
375     "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
376     "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
377     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
378     "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
379     "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
380     "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
381     "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
382     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
383     "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
384     "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
385     "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
386     "gsldrc1    $f28, 0x8(%[pSrc])              \n\t"
387 
388     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
389     "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
390     "gssqc1     $f4, $f6, 0x0($8)               \n\t"
391     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
392     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
393     "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
394     "gssqc1     $f12, $f14, 0x0($8)             \n\t"
395     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
396     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
397     "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
398     "gssqc1     $f20, $f22, 0x0($8)             \n\t"
399     PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
400     PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
401     "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
402     "gssqc1     $f28, $f30, 0x0($8)             \n\t"
403    : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
404    : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
405    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
406      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
407   );
408   RECOVER_REG;
409 }
410 
WelsCopy16x8NotAligned_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)411 void WelsCopy16x8NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
412                                 int32_t iSrcStride) {
413   BACKUP_REG;
414   __asm__ volatile (
415     ".set       arch=loongson3a                 \n\t"
416     "gsldlc1    $f2, 0x7(%[pSrc])               \n\t"
417     "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
418     "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
419     "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
420     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
421     "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
422     "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
423     "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
424     "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
425     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
426     "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
427     "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
428     "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
429     "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
430     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
431     "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
432     "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
433     "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
434     "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
435     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
436     "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
437     "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
438     "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
439     "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
440     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
441     "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
442     "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
443     "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
444     "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
445     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
446     "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
447     "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
448     "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
449     "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
450     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
451     "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
452     "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
453     "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
454     "gsldrc1    $f28, 0x8(%[pSrc])              \n\t"
455 
456     "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
457     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
458     "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
459     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
460     "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
461     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
462     "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
463     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
464     "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
465     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
466     "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
467     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
468     "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
469     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
470     "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t"
471    : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
472    : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
473    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
474      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
475   );
476   RECOVER_REG;
477 }
478