1 /*!
2 * \copy
3 * Copyright (c) 2009-2018, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file copy_mb_mmi.c
33 *
34 * \brief Loongson optimization
35 *
36 * \date 20/07/2018 Created
37 *
38 *************************************************************************************
39 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
/*!
 * \brief Copy a block of 8 rows, 8 bytes per row, from pSrc to pDst.
 *
 * All eight source rows are first loaded into the even FP registers
 * $f0..$f14, then all eight are stored. Each row is moved with a
 * gsldlc1/gsldrc1 (load) or gssdlc1/gssdrc1 (store) pair at offsets 0x7/0x0,
 * i.e. the Loongson left/right doubleword pair, so neither pointer is
 * required to be 8-byte aligned.
 *
 * \param pDst     first byte of the destination block
 * \param iStrideD byte distance between consecutive destination rows
 * \param pSrc     first byte of the source block
 * \param iStrideS byte distance between consecutive source rows
 *
 * pDst and pSrc are read-write asm operands and are advanced inside the asm;
 * they are by-value parameters, so the caller's pointers are not affected.
 * NOTE(review): PTR_ADDU is not defined in this file (presumably a
 * pointer-width add from asmdefs_mmi.h) - confirm.
 */
WelsCopy8x8_mmi(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)43 void WelsCopy8x8_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
44 int32_t iStrideS ) {
45 __asm__ volatile (
46 ".set arch=loongson3a \n\t"
/* Load phase. Rows are handled two at a time: %[pSrc] addresses the even
 * row and scratch register $8 addresses the odd row (pSrc + iStrideS). */
/* rows 0-1 -> $f0, $f2 */
47 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
48 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
49 "gsldlc1 $f2, 0x7($8) \n\t"
50 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
51 "gsldrc1 $f2, 0x0($8) \n\t"
/* rows 2-3 -> $f4, $f6 */
52 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
53 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
54 "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
55 "gsldlc1 $f6, 0x7($8) \n\t"
56 "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
57 "gsldrc1 $f6, 0x0($8) \n\t"
/* rows 4-5 -> $f8, $f10 */
58 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
59 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
60 "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
61 "gsldlc1 $f10, 0x7($8) \n\t"
62 "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
63 "gsldrc1 $f10, 0x0($8) \n\t"
/* rows 6-7 -> $f12, $f14 */
64 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
65 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
66 "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
67 "gsldlc1 $f14, 0x7($8) \n\t"
68 "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
69 "gsldrc1 $f14, 0x0($8) \n\t"
70
/* Store phase. Same two-rows-at-a-time walk over the destination, with $8
 * now holding pDst + iStrideD. */
/* rows 0-1 <- $f0, $f2 */
71 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
72 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
73 "gssdlc1 $f2, 0x7($8) \n\t"
74 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
75 "gssdrc1 $f2, 0x0($8) \n\t"
/* rows 2-3 <- $f4, $f6 */
76 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
77 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
78 "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
79 "gssdlc1 $f6, 0x7($8) \n\t"
80 "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
81 "gssdrc1 $f6, 0x0($8) \n\t"
/* rows 4-5 <- $f8, $f10 */
82 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
83 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
84 "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
85 "gssdlc1 $f10, 0x7($8) \n\t"
86 "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
87 "gssdrc1 $f10, 0x0($8) \n\t"
/* rows 6-7 <- $f12, $f14 */
88 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
89 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
90 "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
91 "gssdlc1 $f14, 0x7($8) \n\t"
92 "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
93 "gssdrc1 $f14, 0x0($8) \n\t"
/* Operands: both pointers are "+&r" (read-write, early-clobber) because the
 * asm modifies them before the stride inputs are last read. */
94 : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
95 : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
/* Clobbers: memory (the destination block), scratch GPR $8, and every FP
 * register used as a row buffer. */
96 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
97 );
98 }
99
/*!
 * \brief Copy a block of 16 rows, 8 bytes per row, from pSrc to pDst.
 *
 * The copy is done in two passes of eight rows each: load rows 0-7 into
 * $f0..$f14, store them, then load rows 8-15 into the same registers and
 * store them. Each row is moved with a gsldlc1/gsldrc1 (load) or
 * gssdlc1/gssdrc1 (store) left/right doubleword pair at offsets 0x7/0x0, so
 * neither pointer is required to be 8-byte aligned.
 *
 * \param pDst     first byte of the destination block
 * \param iStrideD byte distance between consecutive destination rows
 * \param pSrc     first byte of the source block
 * \param iStrideS byte distance between consecutive source rows
 *
 * pDst and pSrc are advanced inside the asm; they are by-value parameters,
 * so the caller's pointers are not affected.
 * NOTE(review): PTR_ADDU is not defined in this file (presumably a
 * pointer-width add from asmdefs_mmi.h) - confirm.
 */
WelsCopy8x16_mmi(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)100 void WelsCopy8x16_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
101 int32_t iStrideS) {
102 __asm__ volatile (
103 ".set arch=loongson3a \n\t"
/* Pass 1, load rows 0-7. Two rows per step: %[pSrc] addresses the even row,
 * scratch register $8 the odd row. */
104 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
105 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
106 "gsldlc1 $f2, 0x7($8) \n\t"
107 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
108 "gsldrc1 $f2, 0x0($8) \n\t"
109 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
110 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
111 "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
112 "gsldlc1 $f6, 0x7($8) \n\t"
113 "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
114 "gsldrc1 $f6, 0x0($8) \n\t"
115 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
116 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
117 "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
118 "gsldlc1 $f10, 0x7($8) \n\t"
119 "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
120 "gsldrc1 $f10, 0x0($8) \n\t"
121 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
122 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
123 "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
124 "gsldlc1 $f14, 0x7($8) \n\t"
125 "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
126 "gsldrc1 $f14, 0x0($8) \n\t"
/* Leave %[pSrc] on row 8 for the second pass. */
127 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
128
/* Pass 1, store rows 0-7 from $f0..$f14. */
129 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
130 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
131 "gssdlc1 $f2, 0x7($8) \n\t"
132 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
133 "gssdrc1 $f2, 0x0($8) \n\t"
134 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
135 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
136 "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
137 "gssdlc1 $f6, 0x7($8) \n\t"
138 "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
139 "gssdrc1 $f6, 0x0($8) \n\t"
140 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
141 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
142 "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
143 "gssdlc1 $f10, 0x7($8) \n\t"
144 "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
145 "gssdrc1 $f10, 0x0($8) \n\t"
146 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
147 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
148 "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
149 "gssdlc1 $f14, 0x7($8) \n\t"
150 "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
151 "gssdrc1 $f14, 0x0($8) \n\t"
/* Leave %[pDst] on row 8 for the second pass. */
152 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
153
/* Pass 2, load rows 8-15 (the row registers are reused). */
154 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
155 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
156 "gsldlc1 $f2, 0x7($8) \n\t"
157 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
158 "gsldrc1 $f2, 0x0($8) \n\t"
159 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
160 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
161 "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
162 "gsldlc1 $f6, 0x7($8) \n\t"
163 "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
164 "gsldrc1 $f6, 0x0($8) \n\t"
165 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
166 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
167 "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
168 "gsldlc1 $f10, 0x7($8) \n\t"
169 "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
170 "gsldrc1 $f10, 0x0($8) \n\t"
171 PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
172 PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
173 "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
174 "gsldlc1 $f14, 0x7($8) \n\t"
175 "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
176 "gsldrc1 $f14, 0x0($8) \n\t"
177
/* Pass 2, store rows 8-15. */
178 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
179 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
180 "gssdlc1 $f2, 0x7($8) \n\t"
181 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
182 "gssdrc1 $f2, 0x0($8) \n\t"
183 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
184 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
185 "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
186 "gssdlc1 $f6, 0x7($8) \n\t"
187 "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
188 "gssdrc1 $f6, 0x0($8) \n\t"
189 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
190 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
191 "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
192 "gssdlc1 $f10, 0x7($8) \n\t"
193 "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
194 "gssdrc1 $f10, 0x0($8) \n\t"
195 PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
196 PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
197 "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
198 "gssdlc1 $f14, 0x7($8) \n\t"
199 "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
200 "gssdrc1 $f14, 0x0($8) \n\t"
/* Operands: both pointers are "+&r" (read-write, early-clobber) because the
 * asm modifies them before the stride inputs are last read. */
201 : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
202 : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
/* Clobbers: memory (the destination block), scratch GPR $8, and every FP
 * register used as a row buffer. */
203 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
204 );
205 }
206
/*!
 * \brief Copy a block of 16 rows, 16 bytes per row, from pSrc to pDst.
 *
 * The copy is done in two passes of eight rows each. In each pass the rows
 * are loaded with gslqc1 into the register pairs ($f0,$f2) .. ($f28,$f30)
 * and then stored with gssqc1 from the same pairs, so the register order
 * used by the loads and the stores matches.
 *
 * NOTE(review): gslqc1/gssqc1 are quad-word accesses; given the separate
 * *NotAligned variants in this file, pSrc, pDst and both strides presumably
 * have to keep every row 16-byte aligned - confirm against the callers.
 *
 * \param pDst       first byte of the destination block
 * \param iDstStride byte distance between consecutive destination rows
 * \param pSrc       first byte of the source block
 * \param iSrcStride byte distance between consecutive source rows
 *
 * pDst and pSrc are advanced inside the asm; they are by-value parameters,
 * so the caller's pointers are not affected.
 * NOTE(review): BACKUP_REG / RECOVER_REG and PTR_ADDU are not defined in
 * this file (presumably asmdefs_mmi.h). BACKUP_REG/RECOVER_REG presumably
 * save and restore FP registers that the asm overwrites - confirm.
 */
WelsCopy16x16_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)207 void WelsCopy16x16_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
208 int32_t iSrcStride) {
209 BACKUP_REG;
210 __asm__ volatile (
211 ".set arch=loongson3a \n\t"
/* Pass 1, load rows 0-7; %[pSrc] steps one row after each load. */
212 "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
213 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
214 "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
215 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
216 "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
217 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
218 "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
219 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
220 "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
221 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
222 "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
223 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
224 "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
225 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
226 "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
/* This last step leaves %[pSrc] on row 8 for the second pass. */
227 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
228
/* Pass 1, store rows 0-7; %[pDst] ends on row 8. */
229 "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
230 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
231 "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
232 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
233 "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
234 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
235 "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
236 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
237 "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
238 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
239 "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
240 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
241 "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
242 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
243 "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
244 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
245
/* Pass 2, load rows 8-15 (the row registers are reused). */
246 "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
247 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
248 "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
249 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
250 "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
251 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
252 "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
253 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
254 "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
255 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
256 "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
257 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
258 "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
259 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
260 "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
261
/* Pass 2, store rows 8-15. */
262 "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
263 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
264 "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
265 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
266 "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
267 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
268 "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
269 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
270 "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
271 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
272 "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
273 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
274 "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
275 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
276 "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
/* Operands: both pointers are "+&r" (read-write, early-clobber) because the
 * asm modifies them before the stride inputs are last read. */
277 : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
278 : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
/* Clobbers: memory (the destination block) and all sixteen even FP
 * registers. No scratch GPR is used in this function. */
279 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
280 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
281 );
282 RECOVER_REG;
283 }
284
/*!
 * \brief Copy a block of 16 rows, 16 bytes per row, from pSrc to pDst, with
 *        the source read through left/right doubleword load pairs.
 *
 * The copy is done in two passes of eight rows each. Every source row is
 * read as two 8-byte halves with gsldlc1/gsldrc1 pairs: bytes 0-7 go to the
 * second register of the row's pair ($f2, $f6, ...) and bytes 8-15 to the
 * first ($f0, $f4, ...). The rows are then written with gssqc1 from the
 * pairs ($f0,$f2), ($f4,$f6), ...
 *
 * NOTE(review): only the loads use the left/right form. The stores are
 * quad-word gssqc1, so pDst and iDstStride presumably still have to keep
 * each destination row 16-byte aligned - confirm against the callers.
 *
 * \param pDst       first byte of the destination block
 * \param iDstStride byte distance between consecutive destination rows
 * \param pSrc       first byte of the source block
 * \param iSrcStride byte distance between consecutive source rows
 *
 * pDst and pSrc are advanced inside the asm; they are by-value parameters,
 * so the caller's pointers are not affected.
 * NOTE(review): BACKUP_REG / RECOVER_REG and PTR_ADDU are not defined in
 * this file (presumably asmdefs_mmi.h). BACKUP_REG/RECOVER_REG presumably
 * save and restore FP registers that the asm overwrites - confirm.
 */
WelsCopy16x16NotAligned_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)285 void WelsCopy16x16NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
286 int32_t iSrcStride) {
287 BACKUP_REG;
288 __asm__ volatile (
289 ".set arch=loongson3a \n\t"
/* Pass 1, load rows 0-7. Per row: offsets 0x7/0x0 fetch bytes 0-7 and
 * offsets 0xF/0x8 fetch bytes 8-15. */
290 "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
291 "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
292 "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
293 "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
294 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
295 "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
296 "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
297 "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
298 "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
299 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
300 "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
301 "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
302 "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
303 "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
304 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
305 "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
306 "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
307 "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
308 "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
309 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
310 "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
311 "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
312 "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
313 "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
314 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
315 "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
316 "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
317 "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
318 "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
319 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
320 "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
321 "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
322 "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
323 "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
324 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
325 "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
326 "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
327 "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
328 "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
/* Leave %[pSrc] on row 8 for the second pass. */
329 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
330
/* Pass 1, store rows 0-7, two rows per step: %[pDst] addresses the even row
 * and scratch register $8 the odd row. %[pDst] ends on row 8. */
331 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
332 "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
333 "gssqc1 $f4, $f6, 0x0($8) \n\t"
334 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
335 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
336 "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
337 "gssqc1 $f12, $f14, 0x0($8) \n\t"
338 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
339 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
340 "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
341 "gssqc1 $f20, $f22, 0x0($8) \n\t"
342 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
343 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
344 "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
345 "gssqc1 $f28, $f30, 0x0($8) \n\t"
346 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
347
/* Pass 2, load rows 8-15 (the row registers are reused). */
348 "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
349 "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
350 "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
351 "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
352 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
353 "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
354 "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
355 "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
356 "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
357 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
358 "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
359 "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
360 "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
361 "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
362 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
363 "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
364 "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
365 "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
366 "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
367 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
368 "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
369 "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
370 "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
371 "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
372 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
373 "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
374 "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
375 "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
376 "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
377 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
378 "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
379 "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
380 "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
381 "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
382 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
383 "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
384 "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
385 "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
386 "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
387
/* Pass 2, store rows 8-15 with the same even/odd ($8) row walk. */
388 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
389 "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
390 "gssqc1 $f4, $f6, 0x0($8) \n\t"
391 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
392 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
393 "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
394 "gssqc1 $f12, $f14, 0x0($8) \n\t"
395 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
396 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
397 "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
398 "gssqc1 $f20, $f22, 0x0($8) \n\t"
399 PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
400 PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
401 "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
402 "gssqc1 $f28, $f30, 0x0($8) \n\t"
/* Operands: both pointers are "+&r" (read-write, early-clobber) because the
 * asm modifies them before the stride inputs are last read. */
403 : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
404 : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
/* Clobbers: memory (the destination block), scratch GPR $8, and all sixteen
 * even FP registers. */
405 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
406 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
407 );
408 RECOVER_REG;
409 }
410
/*!
 * \brief Copy a block of 8 rows, 16 bytes per row, from pSrc to pDst, with
 *        the source read through left/right doubleword load pairs.
 *
 * All eight source rows are loaded first, then all eight are stored. Every
 * source row is read as two 8-byte halves with gsldlc1/gsldrc1 pairs:
 * bytes 0-7 go to the second register of the row's pair ($f2, $f6, ...) and
 * bytes 8-15 to the first ($f0, $f4, ...). The rows are then written with
 * gssqc1 from the pairs ($f0,$f2), ($f4,$f6), ...
 *
 * NOTE(review): only the loads use the left/right form. The stores are
 * quad-word gssqc1, so pDst and iDstStride presumably still have to keep
 * each destination row 16-byte aligned - confirm against the callers.
 *
 * \param pDst       first byte of the destination block
 * \param iDstStride byte distance between consecutive destination rows
 * \param pSrc       first byte of the source block
 * \param iSrcStride byte distance between consecutive source rows
 *
 * pDst and pSrc are advanced inside the asm; they are by-value parameters,
 * so the caller's pointers are not affected.
 * NOTE(review): BACKUP_REG / RECOVER_REG and PTR_ADDU are not defined in
 * this file (presumably asmdefs_mmi.h). BACKUP_REG/RECOVER_REG presumably
 * save and restore FP registers that the asm overwrites - confirm.
 */
WelsCopy16x8NotAligned_mmi(uint8_t * pDst,int32_t iDstStride,uint8_t * pSrc,int32_t iSrcStride)411 void WelsCopy16x8NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
412 int32_t iSrcStride) {
413 BACKUP_REG;
414 __asm__ volatile (
415 ".set arch=loongson3a \n\t"
/* Load rows 0-7. Per row: offsets 0x7/0x0 fetch bytes 0-7 and offsets
 * 0xF/0x8 fetch bytes 8-15; %[pSrc] steps one row between loads. */
416 "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
417 "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
418 "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
419 "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
420 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
421 "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
422 "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
423 "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
424 "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
425 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
426 "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
427 "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
428 "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
429 "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
430 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
431 "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
432 "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
433 "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
434 "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
435 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
436 "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
437 "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
438 "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
439 "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
440 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
441 "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
442 "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
443 "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
444 "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
445 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
446 "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
447 "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
448 "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
449 "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
450 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
451 "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
452 "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
453 "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
454 "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
455
/* Store rows 0-7; %[pDst] steps one row between stores. */
456 "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
457 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
458 "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
459 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
460 "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
461 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
462 "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
463 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
464 "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
465 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
466 "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
467 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
468 "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
469 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
470 "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
/* Operands: both pointers are "+&r" (read-write, early-clobber) because the
 * asm modifies them before the stride inputs are last read. */
471 : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
472 : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
/* Clobbers: memory (the destination block) and all sixteen even FP
 * registers. No scratch GPR is used in this function. */
473 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
474 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
475 );
476 RECOVER_REG;
477 }
478