1 /*!
2 * \copy
3 * Copyright (c) 2009-2018, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file satd_sad_mmi.c
33 *
34 * \brief Loongson optimization
35 *
36 * \date 23/07/2018 Created
37 *
38 *************************************************************************************
39 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
/*
 * MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0)
 *
 * Emits a fixed sequence of saturating unsigned halfword adds (paddush),
 * 64-bit right shifts (dsrl) and unpack/insert operations over f0/f2,
 * using f4/f6 as scratch.
 *
 * Side effects visible in the expansion:
 *   - GPR r0 is overwritten (first with 0x10, then with 0x20).
 *   - f8 is loaded with the shift count 0x10 and f10 with 0x20.
 *   - f6 is zeroed before use; f4 is overwritten with a copy of f2.
 *
 * NOTE(review): presumably a horizontal reduction of the halfword lanes held
 * in f0/f2 -- no caller is visible in this part of the file, confirm there.
 */
43 #define MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0) \
44 "dli "#r0", 0x10 \n\t" \
45 "dmtc1 "#r0", "#f8" \n\t" \
46 "dli "#r0", 0x20 \n\t" \
47 "dmtc1 "#r0", "#f10" \n\t" \
48 "mov.d "#f4", "#f2" \n\t" \
49 "xor "#f6", "#f6", "#f6" \n\t" \
50 "paddush "#f0", "#f0", "#f4" \n\t" \
51 "paddush "#f2", "#f2", "#f6" \n\t" \
52 "dsrl "#f6", "#f2", "#f10" \n\t" \
53 "punpcklwd "#f4", "#f2", "#f2" \n\t" \
54 "punpckhwd "#f4", "#f0", "#f4" \n\t" \
55 "paddush "#f0", "#f0", "#f4" \n\t" \
56 "paddush "#f2", "#f2", "#f6" \n\t" \
57 "dsrl "#f4", "#f0", "#f8" \n\t" \
58 "pinsrh_3 "#f4", "#f4", "#f2" \n\t" \
59 "dsrl "#f6", "#f2", "#f8" \n\t" \
60 "paddush "#f0", "#f0", "#f4" \n\t" \
61 "paddush "#f2", "#f2", "#f6" \n\t"
62
/*
 * MMI_GetSad8x4
 *
 * Accumulates the sum of absolute byte differences of four consecutive
 * 8-byte rows of %[pSample1] and %[pSample2].
 *
 *   - Every row is fetched with a gsldlc1/gsldrc1 pair (offsets 0x7 / 0x0).
 *   - pSample1 rows 0..3 land in $f0, $f4, $f2, $f6; pSample2 rows 0..3 land
 *     in $f8, $f12, $f10, $f14.
 *   - pasubub + biadd reduce each row; rows 0 and 1 are added into $f24,
 *     rows 2 and 3 into $f26 (the caller must initialise both accumulators).
 *   - $8 and $9 are used as scratch row pointers.
 *   - On exit both %[pSample1] and %[pSample2] have advanced by four strides,
 *     so the macro can be chained.
 */
63 #define MMI_GetSad8x4 \
64 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
65 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
66 "gsldlc1 $f4, 0x7($8) \n\t" \
67 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
68 "gsldrc1 $f4, 0x0($8) \n\t" \
69 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
70 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
71 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
72 "gsldlc1 $f6, 0x7($8) \n\t" \
73 "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
74 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
75 "gsldrc1 $f6, 0x0($8) \n\t" \
76 "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
77 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
78 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
79 "gsldlc1 $f12, 0x7($9) \n\t" \
80 "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
81 "gsldrc1 $f12, 0x0($9) \n\t" \
82 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
83 "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
84 "gsldlc1 $f14, 0x7($9) \n\t" \
85 "gsldrc1 $f14, 0x0($9) \n\t" \
86 "pasubub $f0, $f0, $f8 \n\t" \
87 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
88 "pasubub $f2, $f2, $f10 \n\t" \
89 "biadd $f0, $f0 \n\t" \
90 "biadd $f2, $f2 \n\t" \
91 "pasubub $f4, $f4, $f12 \n\t" \
92 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
93 "pasubub $f6, $f6, $f14 \n\t" \
94 "biadd $f4, $f4 \n\t" \
95 "biadd $f6, $f6 \n\t" \
96 "paddh $f24, $f24, $f0 \n\t" \
97 "paddh $f26, $f26, $f2 \n\t" \
98 "paddh $f24, $f24, $f4 \n\t" \
99 "paddh $f26, $f26, $f6 \n\t"
100
/*
 * MMI_GetSad8x4_End
 *
 * Same four-row, 8-byte-wide SAD accumulation into $f24/$f26 as
 * MMI_GetSad8x4 (same registers, same $8/$9 scratch usage), but without the
 * two trailing pointer updates: on exit %[pSample1] and %[pSample2] point at
 * the third row of the group rather than past it. Intended as the last step
 * of a chain.
 */
101 #define MMI_GetSad8x4_End \
102 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
103 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
104 "gsldlc1 $f4, 0x7($8) \n\t" \
105 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
106 "gsldrc1 $f4, 0x0($8) \n\t" \
107 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
108 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
109 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
110 "gsldlc1 $f6, 0x7($8) \n\t" \
111 "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
112 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
113 "gsldrc1 $f6, 0x0($8) \n\t" \
114 "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
115 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
116 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
117 "gsldlc1 $f12, 0x7($9) \n\t" \
118 "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
119 "gsldrc1 $f12, 0x0($9) \n\t" \
120 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
121 "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
122 "gsldlc1 $f14, 0x7($9) \n\t" \
123 "gsldrc1 $f14, 0x0($9) \n\t" \
124 "pasubub $f0, $f0, $f8 \n\t" \
125 "pasubub $f2, $f2, $f10 \n\t" \
126 "biadd $f0, $f0 \n\t" \
127 "biadd $f2, $f2 \n\t" \
128 "pasubub $f4, $f4, $f12 \n\t" \
129 "pasubub $f6, $f6, $f14 \n\t" \
130 "biadd $f4, $f4 \n\t" \
131 "biadd $f6, $f6 \n\t" \
132 "paddh $f24, $f24, $f0 \n\t" \
133 "paddh $f26, $f26, $f2 \n\t" \
134 "paddh $f24, $f24, $f4 \n\t" \
135 "paddh $f26, $f26, $f6 \n\t"
136
/*
 * CACHE_SPLIT_CHECK(r0, width, cacheline)
 *
 * Replaces r0 with (r0 & 0x1f) - 0x1f.
 *
 * The `width` and `cacheline` arguments are not referenced by the expansion;
 * the mask and the subtracted constant are both hard-coded to 0x1f.
 * Because the masked value lies in 0..0x1f, the result is always in
 * -0x1f..0, i.e. never positive.
 */
137 #define CACHE_SPLIT_CHECK(r0, width, cacheline) \
138 "and "#r0", "#r0", 0x1f \n\t" \
139 PTR_ADDIU ""#r0", "#r0", -0x1f \n\t"
140
/*
 * MMI_GetSad2x16
 *
 * Adds the SAD of the NEXT two 16-byte rows into $f0 (low 8 bytes of each
 * row) and $f2 (high 8 bytes of each row).
 *
 *   - Both pointers are advanced by one stride BEFORE each row is read, so
 *     the row the pointers reference on entry is not processed here.
 *   - %[pSample2] rows are read with gsldlc1/gsldrc1 pairs into $f4/$f6.
 *   - %[pSample1] rows are read with a single gslqc1 into $f8 (offset 0) and
 *     $f10 (offset 8).
 *     NOTE(review): gslqc1 presumably requires pSample1 rows to be 16-byte
 *     aligned -- confirm against the Loongson ISA manual / callers.
 *   - On exit both pointers reference the second row processed.
 */
141 #define MMI_GetSad2x16 \
142 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
143 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
144 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
145 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
146 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
147 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
148 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
149 "pasubub $f4, $f4, $f8 \n\t" \
150 "pasubub $f6, $f6, $f10 \n\t" \
151 "biadd $f4, $f4 \n\t" \
152 "biadd $f6, $f6 \n\t" \
153 "paddh $f0, $f0, $f4 \n\t" \
154 "paddh $f2, $f2, $f6 \n\t" \
155 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
156 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
157 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
158 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
159 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
160 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
161 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
162 "pasubub $f4, $f4, $f8 \n\t" \
163 "pasubub $f6, $f6, $f10 \n\t" \
164 "biadd $f4, $f4 \n\t" \
165 "biadd $f6, $f6 \n\t" \
166 "paddh $f0, $f0, $f4 \n\t" \
167 "paddh $f2, $f2, $f6 \n\t"
168
/*
 * MMI_GetSad4x16
 *
 * Accumulates the SAD of four consecutive 16-byte rows into $f28 (low
 * 8 bytes of each row) and $f30 (high 8 bytes of each row).
 *
 *   - %[pSample2] rows are read with gsldlc1/gsldrc1 pairs (any alignment).
 *   - %[pSample1] rows are read with gslqc1 into $f8/$f10.
 *     NOTE(review): presumably requires 16-byte aligned pSample1 rows --
 *     confirm.
 *   - Both pointers are advanced by one stride after every row, so on exit
 *     they point four rows further and the macro can be chained.
 *   - Scratch registers: $f0, $f2 (first row), $f4, $f6 (rows 2-4).
 */
169 #define MMI_GetSad4x16 \
170 "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
171 "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
172 "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
173 "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
174 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
175 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
176 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
177 "pasubub $f0, $f0, $f8 \n\t" \
178 "pasubub $f2, $f2, $f10 \n\t" \
179 "biadd $f0, $f0 \n\t" \
180 "biadd $f2, $f2 \n\t" \
181 "paddh $f28, $f28, $f0 \n\t" \
182 "paddh $f30, $f30, $f2 \n\t" \
183 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
184 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
185 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
186 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
187 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
188 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
189 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
190 "pasubub $f4, $f4, $f8 \n\t" \
191 "pasubub $f6, $f6, $f10 \n\t" \
192 "biadd $f4, $f4 \n\t" \
193 "biadd $f6, $f6 \n\t" \
194 "paddh $f28, $f28, $f4 \n\t" \
195 "paddh $f30, $f30, $f6 \n\t" \
196 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
197 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
198 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
199 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
200 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
201 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
202 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
203 "pasubub $f4, $f4, $f8 \n\t" \
204 "pasubub $f6, $f6, $f10 \n\t" \
205 "biadd $f4, $f4 \n\t" \
206 "biadd $f6, $f6 \n\t" \
207 "paddh $f28, $f28, $f4 \n\t" \
208 "paddh $f30, $f30, $f6 \n\t" \
209 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
210 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
211 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
212 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
213 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
214 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
215 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
216 "pasubub $f4, $f4, $f8 \n\t" \
217 "pasubub $f6, $f6, $f10 \n\t" \
218 "biadd $f4, $f4 \n\t" \
219 "biadd $f6, $f6 \n\t" \
220 "paddh $f28, $f28, $f4 \n\t" \
221 "paddh $f30, $f30, $f6 \n\t"
222
/*
 * MMI_GetSad4x16_Aligned
 *
 * Variant of MMI_GetSad4x16 in which BOTH %[pSample1] and %[pSample2] rows
 * are fetched with a single gslqc1 each (no gsldlc1/gsldrc1 pairs).
 * Accumulates four 16-byte rows into $f28/$f30 and leaves both pointers
 * advanced by four strides.
 *
 * NOTE(review): presumably only valid when both pointers (and strides) keep
 * every row 16-byte aligned -- confirm against the Loongson ISA manual.
 */
223 #define MMI_GetSad4x16_Aligned \
224 "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
225 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
226 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
227 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
228 "pasubub $f0, $f0, $f8 \n\t" \
229 "pasubub $f2, $f2, $f10 \n\t" \
230 "biadd $f0, $f0 \n\t" \
231 "biadd $f2, $f2 \n\t" \
232 "paddh $f28, $f28, $f0 \n\t" \
233 "paddh $f30, $f30, $f2 \n\t" \
234 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
235 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
236 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
237 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
238 "pasubub $f4, $f4, $f8 \n\t" \
239 "pasubub $f6, $f6, $f10 \n\t" \
240 "biadd $f4, $f4 \n\t" \
241 "biadd $f6, $f6 \n\t" \
242 "paddh $f28, $f28, $f4 \n\t" \
243 "paddh $f30, $f30, $f6 \n\t" \
244 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
245 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
246 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
247 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
248 "pasubub $f4, $f4, $f8 \n\t" \
249 "pasubub $f6, $f6, $f10 \n\t" \
250 "biadd $f4, $f4 \n\t" \
251 "biadd $f6, $f6 \n\t" \
252 "paddh $f28, $f28, $f4 \n\t" \
253 "paddh $f30, $f30, $f6 \n\t" \
254 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
255 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
256 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
257 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
258 "pasubub $f4, $f4, $f8 \n\t" \
259 "pasubub $f6, $f6, $f10 \n\t" \
260 "biadd $f4, $f4 \n\t" \
261 "biadd $f6, $f6 \n\t" \
262 "paddh $f28, $f28, $f4 \n\t" \
263 "paddh $f30, $f30, $f6 \n\t"
264
/*
 * MMI_GetSad4x16_End
 *
 * Terminal form of MMI_GetSad4x16: identical loads and accumulation into
 * $f28/$f30 for four 16-byte rows (pSample2 via gsldlc1/gsldrc1, pSample1 via
 * gslqc1), but the pointers are NOT advanced after the fourth row. On exit
 * both pointers reference the last row processed.
 */
265 #define MMI_GetSad4x16_End \
266 "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
267 "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
268 "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
269 "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
270 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
271 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
272 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
273 "pasubub $f0, $f0, $f8 \n\t" \
274 "pasubub $f2, $f2, $f10 \n\t" \
275 "biadd $f0, $f0 \n\t" \
276 "biadd $f2, $f2 \n\t" \
277 "paddh $f28, $f28, $f0 \n\t" \
278 "paddh $f30, $f30, $f2 \n\t" \
279 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
280 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
281 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
282 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
283 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
284 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
285 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
286 "pasubub $f4, $f4, $f8 \n\t" \
287 "pasubub $f6, $f6, $f10 \n\t" \
288 "biadd $f4, $f4 \n\t" \
289 "biadd $f6, $f6 \n\t" \
290 "paddh $f28, $f28, $f4 \n\t" \
291 "paddh $f30, $f30, $f6 \n\t" \
292 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
293 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
294 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
295 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
296 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
297 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
298 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
299 "pasubub $f4, $f4, $f8 \n\t" \
300 "pasubub $f6, $f6, $f10 \n\t" \
301 "biadd $f4, $f4 \n\t" \
302 "biadd $f6, $f6 \n\t" \
303 "paddh $f28, $f28, $f4 \n\t" \
304 "paddh $f30, $f30, $f6 \n\t" \
305 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
306 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
307 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
308 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
309 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
310 "pasubub $f4, $f4, $f8 \n\t" \
311 "pasubub $f6, $f6, $f10 \n\t" \
312 "biadd $f4, $f4 \n\t" \
313 "biadd $f6, $f6 \n\t" \
314 "paddh $f28, $f28, $f4 \n\t" \
315 "paddh $f30, $f30, $f6 \n\t"
316
/*
 * MMI_GetSad4x16_Aligned_End
 *
 * Terminal form of MMI_GetSad4x16_Aligned: both sources are read with
 * gslqc1, four 16-byte rows are accumulated into $f28/$f30, and the pointers
 * are left on the fourth row (no stride is added after the last load).
 */
317 #define MMI_GetSad4x16_Aligned_End \
318 "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
319 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
320 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
321 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
322 "pasubub $f0, $f0, $f8 \n\t" \
323 "pasubub $f2, $f2, $f10 \n\t" \
324 "biadd $f0, $f0 \n\t" \
325 "biadd $f2, $f2 \n\t" \
326 "paddh $f28, $f28, $f0 \n\t" \
327 "paddh $f30, $f30, $f2 \n\t" \
328 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
329 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
330 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
331 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
332 "pasubub $f4, $f4, $f8 \n\t" \
333 "pasubub $f6, $f6, $f10 \n\t" \
334 "biadd $f4, $f4 \n\t" \
335 "biadd $f6, $f6 \n\t" \
336 "paddh $f28, $f28, $f4 \n\t" \
337 "paddh $f30, $f30, $f6 \n\t" \
338 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
339 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
340 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
341 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
342 "pasubub $f4, $f4, $f8 \n\t" \
343 "pasubub $f6, $f6, $f10 \n\t" \
344 "biadd $f4, $f4 \n\t" \
345 "biadd $f6, $f6 \n\t" \
346 "paddh $f28, $f28, $f4 \n\t" \
347 "paddh $f30, $f30, $f6 \n\t" \
348 "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
349 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
350 "pasubub $f4, $f4, $f8 \n\t" \
351 "pasubub $f6, $f6, $f10 \n\t" \
352 "biadd $f4, $f4 \n\t" \
353 "biadd $f6, $f6 \n\t" \
354 "paddh $f28, $f28, $f4 \n\t" \
355 "paddh $f30, $f30, $f6 \n\t"
356
/*
 * MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0)
 *
 * Updates four pairs of hard-coded SAD accumulators for one 16-byte row.
 * The macro arguments are register names that get stringified; the
 * accumulators are fixed and NOT arguments:
 *
 *   $f20/$f22 += SAD(f0:f2   , f12:f14)
 *   $f16/$f18 += SAD(f12:f14 , f8:f10)
 *   $f24/$f26 += SAD(16 bytes loaded from r0 - 1 , f4:f6)
 *   $f28/$f30 += SAD(16 bytes loaded from r0 + 1 , f4:f6)
 *
 * f0, f2, f12 and f14 are destroyed (f12/f14 are reused as load targets for
 * the r0-1 and r0+1 reads, done with gsldlc1/gsldrc1 pairs).
 *
 * NOTE(review): presumably the four accumulator pairs correspond to four
 * neighbouring candidate positions of a search; no caller is visible in this
 * part of the file -- confirm there.
 */
357 #define MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0) \
358 "pasubub "#f0", "#f0", "#f12" \n\t" \
359 "pasubub "#f2", "#f2", "#f14" \n\t" \
360 "pasubub "#f12", "#f12", "#f8" \n\t" \
361 "pasubub "#f14", "#f14", "#f10" \n\t" \
362 "biadd "#f0", "#f0" \n\t" \
363 "biadd "#f2", "#f2" \n\t" \
364 "biadd "#f12", "#f12" \n\t" \
365 "biadd "#f14", "#f14" \n\t" \
366 "paddh $f20, $f20, "#f0" \n\t" \
367 "paddh $f22, $f22, "#f2" \n\t" \
368 "paddh $f16, $f16, "#f12" \n\t" \
369 "paddh $f18, $f18, "#f14" \n\t" \
370 "gsldlc1 "#f12", 0x6("#r0") \n\t" \
371 "gsldlc1 "#f14", 0xE("#r0") \n\t" \
372 "gsldrc1 "#f12", -0x1("#r0") \n\t" \
373 "gsldrc1 "#f14", 0x7("#r0") \n\t" \
374 "pasubub "#f12", "#f12", "#f4" \n\t" \
375 "pasubub "#f14", "#f14", "#f6" \n\t" \
376 "biadd "#f12", "#f12" \n\t" \
377 "biadd "#f14", "#f14" \n\t" \
378 "paddh $f24, $f24, "#f12" \n\t" \
379 "paddh $f26, $f26, "#f14" \n\t" \
380 "gsldlc1 "#f12", 0x8("#r0") \n\t" \
381 "gsldlc1 "#f14", 0x10("#r0") \n\t" \
382 "gsldrc1 "#f12", 0x1("#r0") \n\t" \
383 "gsldrc1 "#f14", 0x9("#r0") \n\t" \
384 "pasubub "#f12", "#f12", "#f4" \n\t" \
385 "pasubub "#f14", "#f14", "#f6" \n\t" \
386 "biadd "#f12", "#f12" \n\t" \
387 "biadd "#f14", "#f14" \n\t" \
388 "paddh $f28, $f28, "#f12" \n\t" \
389 "paddh $f30, $f30, "#f14" \n\t"
390
/*
 * MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18)
 *
 * Applies MMI_SumSub (defined outside this file section) four times to the
 * register pairs (f0:f2, f4:f6), (f8:f10, f12:f14), (f4:f6, f12:f14) and
 * (f0:f2, f8:f10), passing f16/f18 as the trailing arguments each time.
 *
 * NOTE(review): presumably one butterfly stage pair of a 4-point Hadamard
 * transform over two 4x4 blocks, with f16/f18 as scratch -- confirm against
 * the MMI_SumSub definition.
 */
391 #define MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
392 MMI_SumSub(f0, f2, f4, f6, f16, f18) \
393 MMI_SumSub(f8, f10, f12, f14, f16, f18) \
394 MMI_SumSub(f4, f6, f12, f14, f16, f18) \
395 MMI_SumSub(f0, f2, f8, f10, f16, f18)
396
/*
 * MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26)
 *
 * Runs WELS_AbsH (defined outside this file section) in place on the four
 * register pairs f0:f2, f4:f6, f12:f14 and f16:f18, then folds them with
 * saturating unsigned halfword adds:
 *
 *   f0:f2   += f4:f6
 *   f12:f14 += f16:f18
 *   f24:f26 += f0:f2
 *   f24:f26 += f12:f14
 *
 * f24:f26 is therefore the running accumulator. f8:f10 and f20:f22 are only
 * handed to WELS_AbsH as its last two arguments.
 * NOTE(review): those are presumably scratch registers -- confirm against the
 * WELS_AbsH definition.
 */
397 #define MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26) \
398 WELS_AbsH(f0, f2, f0, f2, f8, f10) \
399 WELS_AbsH(f4, f6, f4, f6, f8, f10) \
400 WELS_AbsH(f12, f14, f12, f14, f20, f22) \
401 WELS_AbsH(f16, f18, f16, f18, f20, f22) \
402 "paddush "#f0", "#f0", "#f4" \n\t" \
403 "paddush "#f2", "#f2", "#f6" \n\t" \
404 "paddush "#f12", "#f12", "#f16" \n\t" \
405 "paddush "#f14", "#f14", "#f18" \n\t" \
406 "paddush "#f24", "#f24", "#f0" \n\t" \
407 "paddush "#f26", "#f26", "#f2" \n\t" \
408 "paddush "#f24", "#f24", "#f12" \n\t" \
409 "paddush "#f26", "#f26", "#f14" \n\t"
410
/*
 * MMI_SumWHorizon(f0, f2, f4, f6, f8, f10)
 *
 * Adds f2 into f0 as halfwords, interleaves the halfwords of f0 with f8 into
 * f2 (high half) and f0 (low half), adds those as words, then adds a
 * pshufh-permuted copy of f0 (control taken from f10) back into f0.
 * The result is left in f0; f2 is destroyed. f4 and f6 are accepted but not
 * referenced by the expansion.
 *
 * NOTE(review): presumably f8 must be zero (so the interleave zero-extends
 * halfwords to words) and f10 must hold a shuffle control that swaps the two
 * words, making this a horizontal sum -- no caller is visible here, confirm.
 */
411 #define MMI_SumWHorizon(f0, f2, f4, f6, f8, f10) \
412 "paddh "#f0", "#f0", "#f2" \n\t" \
413 "punpckhhw "#f2", "#f0", "#f8" \n\t" \
414 "punpcklhw "#f0", "#f0", "#f8" \n\t" \
415 "paddw "#f0", "#f0", "#f2" \n\t" \
416 "pshufh "#f2", "#f0", "#f10" \n\t" \
417 "paddw "#f0", "#f0", "#f2" \n\t"
418
/*
 * MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1)
 *
 * Loads 8 bytes from r0 into f0 and 8 bytes from r1 into f4 (gsldlc1/gsldrc1
 * pairs), widens each to halfwords by interleaving with f8 (low half stays in
 * f0 / f4, high half goes to f2 / f6) and leaves the halfword differences
 * r0-row minus r1-row in f0 (low) and f2 (high). f4 and f6 are destroyed.
 *
 * Side effect independent of r0/r1: $11 = %[pSample1] + %[iStride1] and
 * $12 = %[pSample2] + %[iStride2], i.e. the next-row addresses are prepared
 * for a following MMI_LoadDiff8P_Offset_Stride1.
 *
 * NOTE(review): f8 presumably has to be all-zero for the interleave to act as
 * zero extension; the callers in this file pass $f28 -- confirm it is zeroed.
 */
419 #define MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1) \
420 "gsldlc1 "#f0", 0x7("#r0") \n\t" \
421 "gsldlc1 "#f4", 0x7("#r1") \n\t" \
422 PTR_ADDU "$11, %[pSample1], %[iStride1] \n\t" \
423 "gsldrc1 "#f0", 0x0("#r0") \n\t" \
424 "gsldrc1 "#f4", 0x0("#r1") \n\t" \
425 PTR_ADDU "$12, %[pSample2], %[iStride2] \n\t" \
426 "punpckhbh "#f2", "#f0", "#f8" \n\t" \
427 "punpcklbh "#f0", "#f0", "#f8" \n\t" \
428 "punpckhbh "#f6", "#f4", "#f8" \n\t" \
429 "punpcklbh "#f4", "#f4", "#f8" \n\t" \
430 "psubh "#f0", "#f0", "#f4" \n\t" \
431 "psubh "#f2", "#f2", "#f6" \n\t"
432
/*
 * MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1)
 *
 * Same load / widen / subtract sequence as MMI_LoadDiff8P_Offset_Stride0
 * (result in f0:f2, f4:f6 destroyed, f8 used as the interleave operand), but
 * the pointer side effect differs: %[pSample1] = $11 + %[iStride1] and
 * %[pSample2] = $12 + %[iStride2].
 *
 * $11 and $12 must already hold valid row addresses; in this file they are
 * set by a preceding MMI_LoadDiff8P_Offset_Stride0, so a Stride0/Stride1 pair
 * moves both sample pointers forward by two rows.
 */
433 #define MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1) \
434 "gsldlc1 "#f0", 0x7("#r0") \n\t" \
435 "gsldlc1 "#f4", 0x7("#r1") \n\t" \
436 PTR_ADDU "%[pSample1], $11, %[iStride1] \n\t" \
437 "gsldrc1 "#f0", 0x0("#r0") \n\t" \
438 "gsldrc1 "#f4", 0x0("#r1") \n\t" \
439 PTR_ADDU "%[pSample2], $12, %[iStride2] \n\t" \
440 "punpckhbh "#f2", "#f0", "#f8" \n\t" \
441 "punpcklbh "#f0", "#f0", "#f8" \n\t" \
442 "punpckhbh "#f6", "#f4", "#f8" \n\t" \
443 "punpcklbh "#f4", "#f4", "#f8" \n\t" \
444 "psubh "#f0", "#f0", "#f4" \n\t" \
445 "psubh "#f2", "#f2", "#f6" \n\t"
446
/*
 * MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1)
 *
 * Same load / widen / subtract sequence as the Stride0/Stride1 variants
 * (result in f0:f2, f4:f6 destroyed), but the pointer side effect is
 * %[pSample1] = $9 + 8 and %[pSample2] = $10 + 8.
 *
 * NOTE(review): $9 and $10 are not written by this macro; presumably the
 * caller saved the block's original pSample1/pSample2 there so that this
 * step rewinds to the top row, 8 bytes to the right -- confirm at call sites.
 */
447 #define MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1) \
448 "gsldlc1 "#f0", 0x7("#r0") \n\t" \
449 "gsldlc1 "#f4", 0x7("#r1") \n\t" \
450 PTR_ADDU "%[pSample1], $9, 0x8 \n\t" \
451 "gsldrc1 "#f0", 0x0("#r0") \n\t" \
452 "gsldrc1 "#f4", 0x0("#r1") \n\t" \
453 PTR_ADDU "%[pSample2], $10, 0x8 \n\t" \
454 "punpckhbh "#f2", "#f0", "#f8" \n\t" \
455 "punpcklbh "#f0", "#f0", "#f8" \n\t" \
456 "punpckhbh "#f6", "#f4", "#f8" \n\t" \
457 "punpcklbh "#f4", "#f4", "#f8" \n\t" \
458 "psubh "#f0", "#f0", "#f4" \n\t" \
459 "psubh "#f2", "#f2", "#f6" \n\t"
460
/*
 * MMI_GetSatd8x8
 *
 * Two identical passes, each covering four 8-byte rows:
 *   1. four row-difference loads (alternating the Stride0 / Stride1 helpers,
 *      so rows are addressed through %[pSample1]/%[pSample2] and $11/$12),
 *   2. MMI_HDMTwo4x4, MMI_TransTwo4x4H (defined outside this file section),
 *      MMI_HDMTwo4x4 again,
 *   3. MMI_SumAbs4, which accumulates into $f24/$f26.
 *
 * $f28 is passed as the interleave operand of every load helper.
 * Each Stride0/Stride1 pair advances both sample pointers by two rows, so on
 * exit they have moved forward by eight rows.
 * Clobbers $11, $12 and $f0-$f22.
 */
461 #define MMI_GetSatd8x8 \
462 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
463 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
464 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
465 MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
466 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
467 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
468 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
469 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
470 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
471 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
472 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
473 MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
474 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
475 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
476 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
477 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
478
/*
 * MMI_GetSatd8x8_Offset8
 *
 * Same two-pass 8x8 sequence as MMI_GetSatd8x8 (accumulating into $f24/$f26),
 * except that the very last row load uses MMI_LoadDiff8P_Offset8. That helper
 * sets %[pSample1] = $9 + 8 and %[pSample2] = $10 + 8 instead of stepping by
 * a stride.
 *
 * NOTE(review): relies on $9/$10 having been set by the enclosing code
 * (presumably to the block's starting addresses) -- confirm at the call site.
 */
479 #define MMI_GetSatd8x8_Offset8 \
480 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
481 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
482 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
483 MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
484 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
485 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
486 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
487 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
488 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
489 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
490 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
491 MMI_LoadDiff8P_Offset8($f12, $f14, $f20, $f22, $f28, $11, $12) \
492 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
493 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
494 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
495 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
496
/*
 * MMI_GetSatd8x8_End
 *
 * Same two-pass 8x8 sequence as MMI_GetSatd8x8 (accumulating into $f24/$f26),
 * except that the very last row load uses MMI_LoadDiff8P, which is defined
 * outside this file section.
 *
 * NOTE(review): presumably MMI_LoadDiff8P performs the load/difference
 * without touching %[pSample1]/%[pSample2], making this the terminal variant
 * -- confirm against its definition.
 */
497 #define MMI_GetSatd8x8_End \
498 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
499 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
500 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
501 MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
502 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
503 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
504 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
505 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
506 MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
507 MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
508 MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
509 MMI_LoadDiff8P($f12, $f14, $f20, $f22, $f28, $11, $12) \
510 MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
511 MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
512 MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
513 MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
514
WelsSampleSad16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)515 int32_t WelsSampleSad16x16_mmi (uint8_t* pSample1, int32_t iStride1,
516 uint8_t* pSample2, int32_t iStride2) {
517 int32_t iSadSum = 0;
518 BACKUP_REG;
519 __asm__ volatile (
520 ".set arch=loongson3a \n\t"
521 "and $8, %[pSample2], 0xF \n\t"
522 "xor $f28, $f28, $f28 \n\t"
523 "xor $f30, $f30, $f30 \n\t"
524 "bnez $8, unaligned \n\t"
525 "aligned: \n\t"
526 MMI_GetSad4x16_Aligned
527 MMI_GetSad4x16_Aligned
528 MMI_GetSad4x16_Aligned
529 MMI_GetSad4x16_Aligned_End
530 "b out \n\t"
531
532 "unaligned: \n\t"
533 MMI_GetSad4x16
534 MMI_GetSad4x16
535 MMI_GetSad4x16
536 MMI_GetSad4x16_End
537 "out: \n\t"
538 "mov.d $f0, $f30 \n\t"
539 "paddh $f0, $f0, $f28 \n\t"
540 "dmfc1 %[iSadSum], $f0 \n\t"
541 : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
542 [pSample2]"+&r"((unsigned char *)pSample2)
543 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
544 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
545 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
546 );
547 RECOVER_REG;
548 return iSadSum;
549 }
550
/*
 * WelsSampleSad16x8_mmi: sum of absolute differences of a block that is
 * 16 bytes wide and 8 rows high (Loongson MMI).
 *
 *   pSample1 / iStride1  first block and its row stride; rows are read with
 *                        gslqc1 (NOTE(review): presumably 16-byte aligned --
 *                        confirm against callers)
 *   pSample2 / iStride2  second block and its row stride; rows are read with
 *                        gsldlc1/gsldrc1 pairs
 *
 * Rows 0 and 1 are handled inline: row 0 initialises $f0/$f2, row 1 is added
 * to them. Three MMI_GetSad2x16 expansions then add rows 2..7 (that macro
 * steps the pointers before each load). $f0 + $f2 is returned.
 */
WelsSampleSad16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)551 int32_t WelsSampleSad16x8_mmi (uint8_t* pSample1, int32_t iStride1,
552 uint8_t* pSample2, int32_t iStride2) {
553 int32_t iSadSum = 0;
554 BACKUP_REG;
555 __asm__ volatile (
556 ".set arch=loongson3a \n\t"
557 "gsldlc1 $f0, 0x7(%[pSample2]) \n\t"
558 "gsldlc1 $f2, 0xF(%[pSample2]) \n\t"
559 "gsldrc1 $f0, 0x0(%[pSample2]) \n\t"
560 "gsldrc1 $f2, 0x8(%[pSample2]) \n\t"
561 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
562 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
563 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
564 "pasubub $f0, $f0, $f8 \n\t"
565 "pasubub $f2, $f2, $f10 \n\t"
566 "biadd $f0, $f0 \n\t"
567 "biadd $f2, $f2 \n\t"
568 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
569 "gsldlc1 $f6, 0xF(%[pSample2]) \n\t"
570 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
571 "gsldrc1 $f6, 0x8(%[pSample2]) \n\t"
572 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
573 "pasubub $f4, $f4, $f8 \n\t"
574 "pasubub $f6, $f6, $f10 \n\t"
575 "biadd $f4, $f4 \n\t"
576 "biadd $f6, $f6 \n\t"
577 "paddh $f0, $f0, $f4 \n\t"
578 "paddh $f2, $f2, $f6 \n\t"
579
580 MMI_GetSad2x16
581 MMI_GetSad2x16
582 MMI_GetSad2x16
583
584 "paddh $f0, $f0, $f2 \n\t"
585 "dmfc1 %[iSadSum], $f0 \n\t"
586 : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
587 [pSample2]"+&r"((unsigned char *)pSample2)
588 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
589 : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
590 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
591 );
592 RECOVER_REG;
593 return iSadSum;
594 }
595
/*
 * WelsSampleSad8x16_mmi: sum of absolute differences of a block that is
 * 8 bytes wide and 16 rows high (Loongson MMI).
 *
 *   pSample1 / iStride1  first block and its row stride
 *   pSample2 / iStride2  second block and its row stride
 *
 * Both sources are read with gsldlc1/gsldrc1 pairs (inside MMI_GetSad8x4).
 * The accumulators $f24/$f26 are zeroed, four 4-row groups are added
 * (three MMI_GetSad8x4 plus the terminal MMI_GetSad8x4_End), and
 * $f24 + $f26 is returned.
 */
WelsSampleSad8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)596 int32_t WelsSampleSad8x16_mmi (uint8_t* pSample1, int32_t iStride1,
597 uint8_t* pSample2, int32_t iStride2) {
598 int32_t iSadSum = 0;
599 BACKUP_REG;
600 __asm__ volatile (
601 ".set arch=loongson3a \n\t"
602 "xor $f24, $f24, $f24 \n\t"
603 "xor $f26, $f26, $f26 \n\t"
604 MMI_GetSad8x4
605 MMI_GetSad8x4
606 MMI_GetSad8x4
607 MMI_GetSad8x4_End
608 "paddh $f0, $f26, $f24 \n\t"
609 "dmfc1 %[iSadSum], $f0 \n\t"
610 : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
611 [pSample2]"+&r"((unsigned char *)pSample2)
612 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
613 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
614 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
615 );
616 RECOVER_REG;
617 return iSadSum;
618 }
619
/*
 * WelsSampleSad4x4_mmi: sum of absolute differences of a 4x4 block
 * (Loongson MMI).
 *
 *   pSample1 / iStride1  first block and its row stride
 *   pSample2 / iStride2  second block and its row stride
 *
 * Rows are processed two at a time: punpcklwd packs the low 4 bytes of two
 * consecutive rows into one 64-bit register, then pasubub + biadd reduce the
 * pair. The two pair sums are added in $f0 and returned.
 *
 * Each row is fetched with an 8-byte gsldlc1/gsldrc1 pair, so 8 bytes are
 * read per row even though only the low 4 contribute to the result.
 * Unlike the other functions in this file, no BACKUP_REG/RECOVER_REG is used
 * here; only $8, $9 and $f0-$f8 are touched.
 */
WelsSampleSad4x4_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)620 int32_t WelsSampleSad4x4_mmi (uint8_t* pSample1, int32_t iStride1,
621 uint8_t* pSample2, int32_t iStride2) {
622 int32_t iSadSum = 0;
623 __asm__ volatile (
624 ".set arch=loongson3a \n\t"
625 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
626 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
627 "gsldlc1 $f2, 0x7($8) \n\t"
628 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
629 "gsldrc1 $f2, 0x0($8) \n\t"
630 "punpcklwd $f0, $f0, $f2 \n\t"
631
632 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
633 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
634 "gsldlc1 $f8, 0x7($9) \n\t"
635 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
636 "gsldrc1 $f8, 0x0($9) \n\t"
637 "punpcklwd $f6, $f6, $f8 \n\t"
638 "pasubub $f0, $f0, $f6 \n\t"
639 "biadd $f0, $f0 \n\t"
640
641 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
642 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
643
644 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
645 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
646 "gsldlc1 $f4, 0x7($8) \n\t"
647 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
648 "gsldrc1 $f4, 0x0($8) \n\t"
649 "punpcklwd $f2, $f2, $f4 \n\t"
650
651 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
652 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
653 "gsldlc1 $f8, 0x7($9) \n\t"
654 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
655 "gsldrc1 $f8, 0x0($9) \n\t"
656 "punpcklwd $f6, $f6, $f8 \n\t"
657 "pasubub $f2, $f2, $f6 \n\t"
658 "biadd $f2, $f2 \n\t"
659 "paddh $f0, $f0, $f2 \n\t"
660
661 "dmfc1 %[iSadSum], $f0 \n\t"
662 : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
663 [pSample2]"+&r"((unsigned char *)pSample2)
664 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
665 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8"
666 );
667 return iSadSum;
668 }
669
/*
 * WelsSampleSad8x8_mmi: sum of absolute differences of an 8x8 block
 * (Loongson MMI).
 *
 *   pSample1 / iStride1  first block and its row stride
 *   pSample2 / iStride2  second block and its row stride
 *
 * The function contains two code paths selected by CACHE_SPLIT_CHECK + blez:
 *
 *   label 1 ("no split"): zero $f24/$f26, run MMI_GetSad8x4 followed by
 *       MMI_GetSad8x4_End, return $f24 + $f26.
 *
 *   fall-through ("split"): round pSample2 down to an 8-byte boundary, keep
 *       the byte offset, and for each row rebuild the wanted 8 bytes from the
 *       two neighbouring 8-byte words:
 *           (word0 >> offset*8) | (word1 << (8-offset)*8)
 *       with the shift counts held in $f20 and $f24. Eight rows are processed
 *       as four row pairs, accumulating into $f28/$f30.
 *
 * NOTE(review): $8 is not loaded from pSample2 (or anything else) before
 * CACHE_SPLIT_CHECK is applied to it. In addition, CACHE_SPLIT_CHECK as
 * defined in this file computes ($8 & 0x1f) - 0x1f, which can never be
 * positive, so "blez $8, 1f" is always taken and the fall-through path is
 * not reachable as written. Confirm whether the split path is meant to be
 * live before changing either the macro or this function.
 */
WelsSampleSad8x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)670 int32_t WelsSampleSad8x8_mmi (uint8_t* pSample1, int32_t iStride1,
671 uint8_t* pSample2, int32_t iStride2) {
672 int32_t iSadSum = 0;
673 BACKUP_REG;
674 __asm__ volatile (
675 ".set arch=loongson3a \n\t"
676 CACHE_SPLIT_CHECK($8, 8, 32)
677 "blez $8, 1f \n\t"
678 "nop \n\t"
/* ---- split path: accumulators $f28/$f30 ---- */
679 "xor $f28, $f28, $f28 \n\t"
680 "xor $f30, $f30, $f30 \n\t"
681
/* $9 = pSample2 & 7 (byte offset); pSample2 rounded down; $8 = 8 - offset */
682 "move $9, %[pSample2] \n\t"
683 "and $9, $9, 0x7 \n\t"
684 PTR_SUBU "%[pSample2], %[pSample2], $9 \n\t"
685 "dli $8, 0x8 \n\t"
686 PTR_SUBU "$8, $8, $9 \n\t"
687
/* convert both byte counts to bit counts: $f20 = right shift, $f24 = left shift */
688 "dsll $9, $9, 0x3 \n\t"
689 "dsll $8, $8, 0x3 \n\t"
690 "dmtc1 $9, $f20 \n\t"
691 "dmtc1 $8, $f24 \n\t"
/* from here on $9 tracks the second 8-byte word of each pSample2 row */
692 "dli $9, 0x8 \n\t"
693 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
694 PTR_ADDU "$9, $9, %[pSample2] \n\t"
695 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
696 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
697 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
698
699 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
700 "gsldlc1 $f8, 0x7($9) \n\t"
701 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
702 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
703 "gsldrc1 $f8, 0x0($9) \n\t"
704 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
705 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
706 PTR_ADDU "$9, $9, %[iStride2] \n\t"
707 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
708 "gsldlc1 $f10, 0x7($9) \n\t"
709 "dsrl $f4, $f4, $f20 \n\t"
710 "gsldrc1 $f10, 0x0($9) \n\t"
711 "dsrl $f6, $f6, $f20 \n\t"
712 "dsll $f8, $f8, $f24 \n\t"
713 "dsll $f10, $f10, $f24 \n\t"
714 "or $f4, $f4, $f8 \n\t"
715 "or $f6, $f6, $f10 \n\t"
716
717 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
718 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
719 "pasubub $f0, $f0, $f4 \n\t"
720 "pasubub $f2, $f2, $f6 \n\t"
721 "biadd $f0, $f0 \n\t"
722 "biadd $f2, $f2 \n\t"
723 "paddh $f28, $f28, $f0 \n\t"
724 "paddh $f30, $f30, $f2 \n\t"
725
/* second row pair */
726 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
727 PTR_ADDU "$9, $9, %[iStride2] \n\t"
728 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
729
730 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
731 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
732
733 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
734 "gsldlc1 $f8, 0x7($9) \n\t"
735 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
736 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
737 "gsldrc1 $f8, 0x0($9) \n\t"
738 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
739 PTR_ADDU "$9, $9, %[iStride2] \n\t"
740 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
741 "gsldlc1 $f10, 0x7($9) \n\t"
742 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
743 "gsldrc1 $f10, 0x0($9) \n\t"
744 "dsrl $f4, $f4, $f20 \n\t"
745 "dsrl $f6, $f6, $f20 \n\t"
746 "dsll $f8, $f8, $f24 \n\t"
747 "dsll $f10, $f10, $f24 \n\t"
748 "or $f4, $f4, $f8 \n\t"
749 "or $f6, $f6, $f10 \n\t"
750
751 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
752 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
753 PTR_ADDU "$9, $9, %[iStride2] \n\t"
754
755 "pasubub $f0, $f0, $f4 \n\t"
756 "pasubub $f2, $f2, $f6 \n\t"
757 "biadd $f0, $f0 \n\t"
758 "biadd $f2, $f2 \n\t"
759 "paddh $f28, $f28, $f0 \n\t"
/* third row pair: its first pSample1 load is interleaved with the accumulate above */
760 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
761 "paddh $f30, $f30, $f2 \n\t"
762 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
763
764 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
765 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
766
767 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
768 "gsldlc1 $f8, 0x7($9) \n\t"
769 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
770 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
771 "gsldrc1 $f8, 0x0($9) \n\t"
772 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
773 PTR_ADDU "$9, $9, %[iStride2] \n\t"
774 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
775 "gsldlc1 $f10, 0x7($9) \n\t"
776 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
777 "gsldrc1 $f10, 0x0($9) \n\t"
778 "dsrl $f4, $f4, $f20 \n\t"
779 "dsrl $f6, $f6, $f20 \n\t"
780 "dsll $f8, $f8, $f24 \n\t"
781 "dsll $f10, $f10, $f24 \n\t"
782 "or $f4, $f4, $f8 \n\t"
783 "or $f6, $f6, $f10 \n\t"
784
785 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
786 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
787 PTR_ADDU "$9, $9, %[iStride2] \n\t"
788
789 "pasubub $f0, $f0, $f4 \n\t"
790 "pasubub $f2, $f2, $f6 \n\t"
791 "biadd $f0, $f0 \n\t"
792 "biadd $f2, $f2 \n\t"
793 "paddh $f28, $f28, $f0 \n\t"
/* fourth row pair */
794 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
795 "paddh $f30, $f30, $f2 \n\t"
796
797 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
798 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
799 "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
800
801 "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
802 "gsldlc1 $f8, 0x7($9) \n\t"
803 "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
804 "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
805 "gsldrc1 $f8, 0x0($9) \n\t"
806 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
807 PTR_ADDU "$9, $9, %[iStride2] \n\t"
808 "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
809 "gsldlc1 $f10, 0x7($9) \n\t"
810 "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
811 "gsldrc1 $f10, 0x0($9) \n\t"
812 "dsrl $f4, $f4, $f20 \n\t"
813 "dsrl $f6, $f6, $f20 \n\t"
814 "dsll $f8, $f8, $f24 \n\t"
815 "dsll $f10, $f10, $f24 \n\t"
816 "or $f4, $f4, $f8 \n\t"
817 "or $f6, $f6, $f10 \n\t"
818
819 "pasubub $f0, $f0, $f4 \n\t"
820 "pasubub $f2, $f2, $f6 \n\t"
821 "biadd $f0, $f0 \n\t"
822 "biadd $f2, $f2 \n\t"
823 "paddh $f28, $f28, $f0 \n\t"
824 "paddh $f30, $f30, $f2 \n\t"
825
/* split path result: $f28 + $f30, then skip the no-split path */
826 "mov.d $f0, $f30 \n\t"
827 "paddh $f0, $f0, $f28 \n\t"
828 "dmfc1 %[iSadSum], $f0 \n\t"
829 "j 2f \n\t"
830 "nop \n\t"
831
/* ---- no-split path: accumulators $f24/$f26 ---- */
832 "1: \n\t"
833 "xor $f24, $f24, $f24 \n\t"
834 "xor $f26, $f26, $f26 \n\t"
835 MMI_GetSad8x4
836 MMI_GetSad8x4_End
837 "paddh $f0, $f26, $f24 \n\t"
838 "dmfc1 %[iSadSum], $f0 \n\t"
839 "2: \n\t"
840 : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
841 [pSample2]"+&r"((unsigned char *)pSample2)
842 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
843 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
844 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
845 );
846 RECOVER_REG;
847 return iSadSum;
848 }
849
/*!
 * \brief 4x4 transformed-difference cost (SATD, going by the function name)
 *        between two sample blocks, written as Loongson MMI inline assembly.
 *
 * The asm loads four rows from each block, widens the bytes to 16-bit lanes,
 * subtracts pSample2 from pSample1, runs the differences through add/subtract
 * butterfly stages interleaved with the MMI_XSawp_* shuffles, accumulates the
 * absolute values and sums them horizontally.
 *
 * \param pSample1 first block; the pointer is advanced inside the asm
 * \param iStride1 offset added to pSample1 to step from one row to the next
 * \param pSample2 second block; the pointer is advanced inside the asm
 * \param iStride2 offset added to pSample2 to step from one row to the next
 * \return the low 16 bits of the accumulated sum, shifted right by one
 *
 * NOTE(review): every row is fetched with a gsldlc1/gsldrc1 pair spanning
 * offsets 0x0..0x7 although only the low word survives the punpcklwd merge --
 * confirm that callers tolerate the wider read.
 */
WelsSampleSatd4x4_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)850 int32_t WelsSampleSatd4x4_mmi (uint8_t* pSample1, int32_t iStride1,
851 uint8_t* pSample2, int32_t iStride2) {
852 int32_t iSatdSum = 0;
853 BACKUP_REG;
854 __asm__ volatile (
855 ".set arch=loongson3a \n\t"
/* pSample1 rows 0 and 1 -> $f0, $f4 ($8 = pSample1 + iStride1). */
856 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
857 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
858 "gsldlc1 $f4, 0x7($8) \n\t"
859 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
860 "gsldrc1 $f4, 0x0($8) \n\t"
/* Rows 2 and 3 -> $f8, $f12, then merge low words:
   $f0 = rows 0|2, $f4 = rows 1|3. */
861 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
862 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
863 "gsldlc1 $f8, 0x7(%[pSample1]) \n\t"
864 "gsldlc1 $f12, 0x7($8) \n\t"
865 "gsldrc1 $f8, 0x0(%[pSample1]) \n\t"
866 "gsldrc1 $f12, 0x0($8) \n\t"
867 "punpcklwd $f0, $f0, $f8 \n\t"
868 "punpcklwd $f4, $f4, $f12 \n\t"
869
/* Same loading pattern for pSample2: $f16 = rows 0|2, $f20 = rows 1|3. */
870 PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
871 "gsldlc1 $f16, 0x7(%[pSample2]) \n\t"
872 "gsldlc1 $f20, 0x7($8) \n\t"
873 "gsldrc1 $f16, 0x0(%[pSample2]) \n\t"
874 "gsldrc1 $f20, 0x0($8) \n\t"
875 PTR_ADDU "%[pSample2], $8, %[iStride2] \n\t"
876 PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
877 "gsldlc1 $f24, 0x7(%[pSample2]) \n\t"
878 "gsldlc1 $f28, 0x7($8) \n\t"
879 "gsldrc1 $f24, 0x0(%[pSample2]) \n\t"
880 "gsldrc1 $f28, 0x0($8) \n\t"
881 "punpcklwd $f16, $f16, $f24 \n\t"
882 "punpcklwd $f20, $f20, $f28 \n\t"
883
/* Clear $f24/$f26: $f24 is the zero operand for the byte -> halfword
   unpacking below, and both registers later act as the accumulators. */
884 "xor $f24, $f24, $f24 \n\t"
885 "xor $f26, $f26, $f26 \n\t"
886 "punpckhbh $f2, $f0, $f24 \n\t"
887 "punpcklbh $f0, $f0, $f24 \n\t"
888 "punpckhbh $f6, $f4, $f24 \n\t"
889 "punpcklbh $f4, $f4, $f24 \n\t"
890 "punpckhbh $f18, $f16, $f24 \n\t"
891 "punpcklbh $f16, $f16, $f24 \n\t"
892 "punpckhbh $f22, $f20, $f24 \n\t"
893 "punpcklbh $f20, $f20, $f24 \n\t"
894
/* Per-lane difference pSample1 - pSample2. */
895 "psubh $f0, $f0, $f16 \n\t"
896 "psubh $f2, $f2, $f18 \n\t"
897 "psubh $f4, $f4, $f20 \n\t"
898 "psubh $f6, $f6, $f22 \n\t"
899
/* Butterfly: sums stay in $f0/$f2, differences go to $f8/$f10. */
900 "mov.d $f8, $f0 \n\t"
901 "mov.d $f10, $f2 \n\t"
902 "paddh $f0, $f0, $f4 \n\t"
903 "paddh $f2, $f2, $f6 \n\t"
904 "psubh $f8, $f8, $f4 \n\t"
905 "psubh $f10, $f10, $f6 \n\t"
/* MMI_XSawp_DQ is defined outside this block; presumably a doubleword
   interleave feeding the next butterfly -- confirm against its definition. */
906 MMI_XSawp_DQ($f0, $f2, $f8, $f10, $f12, $f14)
907
908 "mov.d $f16, $f0 \n\t"
909 "mov.d $f18, $f2 \n\t"
910 "paddh $f0, $f0, $f12 \n\t"
911 "paddh $f2, $f2, $f14 \n\t"
912 "psubh $f16, $f16, $f12 \n\t"
913 "psubh $f18, $f18, $f14 \n\t"
914
/* Halfword/word/doubleword shuffles between the butterfly passes. */
915 "mov.d $f8, $f2 \n\t"
916 "punpckhhw $f2, $f0, $f16 \n\t"
917 "punpcklhw $f0, $f0, $f16 \n\t"
918 "punpcklhw $f16, $f18, $f8 \n\t"
919 "punpckhhw $f18, $f18, $f8 \n\t"
920
921 MMI_XSawp_WD($f0, $f2, $f16, $f18, $f12, $f14)
922 MMI_XSawp_DQ($f0, $f2, $f12, $f14, $f20, $f22)
923
924 "mov.d $f28, $f0 \n\t"
925 "mov.d $f30, $f2 \n\t"
926 "paddh $f0, $f0, $f20 \n\t"
927 "paddh $f2, $f2, $f22 \n\t"
928 "psubh $f28, $f28, $f20 \n\t"
929 "psubh $f30, $f30, $f22 \n\t"
930
931 MMI_XSawp_DQ($f0, $f2, $f28, $f30, $f4, $f6)
932
933 "psubh $f8, $f0, $f4 \n\t"
934 "psubh $f10, $f2, $f6 \n\t"
935 "paddh $f0, $f0, $f4 \n\t"
936 "paddh $f2, $f2, $f6 \n\t"
937
/* WELS_AbsH (defined elsewhere) presumably produces per-halfword absolute
   values; the results are accumulated with unsigned saturating adds. */
938 WELS_AbsH($f0, $f2, $f0, $f2, $f12, $f14)
939 "paddush $f24, $f24, $f0 \n\t"
940 "paddush $f26, $f26, $f2 \n\t"
941 WELS_AbsH($f8, $f10, $f8, $f10, $f16, $f18)
942 "paddush $f24, $f24, $f8 \n\t"
943 "paddush $f26, $f26, $f10 \n\t"
944 MMI_SumWHorizon1($f24, $f26, $f16, $f18, $f28, $f30, $8)
945
/* Keep the low 16 bits of the horizontal sum and halve it. */
946 "dmfc1 $8, $f24 \n\t"
947 "dli $9, 0xffff \n\t"
948 "and $8, $8, $9 \n\t"
949 "dsrl %[iSatdSum], $8, 0x1 \n\t"
950 : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
951 [pSample2]"+&r"((unsigned char *)pSample2)
952 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
953 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
954 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
955 );
956 RECOVER_REG;
957 return iSatdSum;
958 }
959
/*!
 * \brief 8x8 SATD (going by the function name) between two sample blocks,
 *        written as Loongson MMI inline assembly.
 *
 * The per-block work is delegated to the MMI_GetSatd8x8_End macro, which is
 * defined outside this block. This function only prepares the registers
 * around it and reduces the accumulators $f24/$f26 to a scalar.
 *
 * \param pSample1 first block; passed as a read/write asm operand
 * \param iStride1 row stride for pSample1
 * \param pSample2 second block; passed as a read/write asm operand
 * \param iStride2 row stride for pSample2
 * \return the low 32 bits of $f24 after MMI_SumWHorizon
 */
WelsSampleSatd8x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)960 int32_t WelsSampleSatd8x8_mmi (uint8_t* pSample1, int32_t iStride1,
961 uint8_t* pSample2, int32_t iStride2) {
962 int32_t iSatdSum = 0;
963 BACKUP_REG;
964 __asm__ volatile (
965 ".set arch=loongson3a \n\t"
/* Clear $f24, $f26 and $f28, and load the constant 1 into $f30. */
966 "xor $f24, $f24, $f24 \n\t"
967 "xor $f26, $f26, $f26 \n\t"
968 "dli $8, 0x1 \n\t"
969 "xor $f28, $f28, $f28 \n\t"
970 "dmtc1 $8, $f30 \n\t"
971 MMI_GetSatd8x8_End
/* Shift every halfword of the accumulators right by $f30
   (presumably still 1 after the macro -- confirm), then reload $f30 with
   0x4e as the last argument of MMI_SumWHorizon. */
972 "psrlh $f24, $f24, $f30 \n\t"
973 "dli $8, 0x4e \n\t"
974 "psrlh $f26, $f26, $f30 \n\t"
975 "dmtc1 $8, $f30 \n\t"
976 MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
977 "mfc1 %[iSatdSum], $f24 \n\t"
978 : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
979 [pSample2]"+&r"((unsigned char *)pSample2)
980 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
/* $11 and $12 are not referenced in the visible asm text; presumably the
   macros use them -- confirm. */
981 : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
982 "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
983 );
984 RECOVER_REG;
985 return iSatdSum;
986 }
987
/*!
 * \brief 8x16 SATD (going by the function name) between two sample blocks,
 *        written as Loongson MMI inline assembly.
 *
 * Expands MMI_GetSatd8x8 followed by MMI_GetSatd8x8_End (both defined outside
 * this block; presumably one 8x8 pass each -- confirm), then reduces the
 * accumulators $f24/$f26 to a scalar.
 *
 * \param pSample1 first block; passed as a read/write asm operand
 * \param iStride1 row stride for pSample1
 * \param pSample2 second block; passed as a read/write asm operand
 * \param iStride2 row stride for pSample2
 * \return the low 32 bits of $f24 after MMI_SumWHorizon
 */
WelsSampleSatd8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)988 int32_t WelsSampleSatd8x16_mmi (uint8_t* pSample1, int32_t iStride1,
989 uint8_t* pSample2, int32_t iStride2) {
990 int32_t iSatdSum = 0;
991 BACKUP_REG;
992 __asm__ volatile (
993 ".set arch=loongson3a \n\t"
/* Clear $f24, $f26 and $f28, and load the constant 1 into $f30. */
994 "xor $f24, $f24, $f24 \n\t"
995 "xor $f26, $f26, $f26 \n\t"
996 "dli $8, 0x1 \n\t"
997 "xor $f28, $f28, $f28 \n\t"
998 "dmtc1 $8, $f30 \n\t"
999 MMI_GetSatd8x8
1000 MMI_GetSatd8x8_End
/* Shift every halfword of the accumulators right by $f30
   (presumably still 1 after the macros -- confirm), then reload $f30 with
   0x4e as the last argument of MMI_SumWHorizon. */
1001 "psrlh $f24, $f24, $f30 \n\t"
1002 "dli $8, 0x4e \n\t"
1003 "psrlh $f26, $f26, $f30 \n\t"
1004 "dmtc1 $8, $f30 \n\t"
1005 MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
1006 "mfc1 %[iSatdSum], $f24 \n\t"
1007 : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1008 [pSample2]"+&r"((unsigned char *)pSample2)
1009 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
/* $11 and $12 are not referenced in the visible asm text; presumably the
   macros use them -- confirm. */
1010 : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1011 "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1012 );
1013 RECOVER_REG;
1014 return iSatdSum;
1015 }
1016
/*!
 * \brief 16x8 SATD (going by the function name) between two sample blocks,
 *        written as Loongson MMI inline assembly.
 *
 * The starting pointers are copied to $9/$10 before MMI_GetSatd8x8_Offset8 and
 * MMI_GetSatd8x8_End are expanded. Both macros are defined outside this
 * block; presumably Offset8 handles one 8-column half and uses $9/$10 to
 * reposition for the other -- confirm against the macro definitions.
 *
 * \param pSample1 first block; passed as a read/write asm operand
 * \param iStride1 row stride for pSample1
 * \param pSample2 second block; passed as a read/write asm operand
 * \param iStride2 row stride for pSample2
 * \return the low 32 bits of $f24 after MMI_SumWHorizon
 */
WelsSampleSatd16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)1017 int32_t WelsSampleSatd16x8_mmi (uint8_t* pSample1, int32_t iStride1,
1018 uint8_t* pSample2, int32_t iStride2) {
1019 int32_t iSatdSum = 0;
1020 BACKUP_REG;
1021 __asm__ volatile (
1022 ".set arch=loongson3a \n\t"
/* Clear $f24, $f26 and $f28, and load the constant 1 into $f30. */
1023 "xor $f24, $f24, $f24 \n\t"
1024 "xor $f26, $f26, $f26 \n\t"
1025 "dli $8, 0x1 \n\t"
1026 "xor $f28, $f28, $f28 \n\t"
1027 "dmtc1 $8, $f30 \n\t"
/* Remember the original block pointers in $9 / $10. */
1028 "move $9, %[pSample1] \n\t"
1029 "move $10, %[pSample2] \n\t"
1030 MMI_GetSatd8x8_Offset8
1031
1032 MMI_GetSatd8x8_End
/* Shift every halfword of the accumulators right by $f30
   (presumably still 1 after the macros -- confirm), then reload $f30 with
   0x4e as the last argument of MMI_SumWHorizon. */
1033 "psrlh $f24, $f24, $f30 \n\t"
1034 "dli $8, 0x4e \n\t"
1035 "psrlh $f26, $f26, $f30 \n\t"
1036 "dmtc1 $8, $f30 \n\t"
1037 MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
1038 "mfc1 %[iSatdSum], $f24 \n\t"
1039 : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1040 [pSample2]"+&r"((unsigned char *)pSample2)
1041 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
1042 : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
1043 "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
1044 "$f26", "$f28", "$f30"
1045 );
1046 RECOVER_REG;
1047 return iSatdSum;
1048 }
1049
/*!
 * \brief 16x16 SATD (going by the function name) between two sample blocks,
 *        written as Loongson MMI inline assembly.
 *
 * The starting pointers are copied to $9/$10, then four 8x8 macro expansions
 * run in the order MMI_GetSatd8x8, MMI_GetSatd8x8_Offset8, MMI_GetSatd8x8,
 * MMI_GetSatd8x8_End. The macros are defined outside this block; presumably
 * each covers one 8x8 quadrant -- confirm against their definitions.
 *
 * \param pSample1 first block; passed as a read/write asm operand
 * \param iStride1 row stride for pSample1
 * \param pSample2 second block; passed as a read/write asm operand
 * \param iStride2 row stride for pSample2
 * \return the low 32 bits of $f24 after MMI_SumWHorizon
 */
WelsSampleSatd16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)1050 int32_t WelsSampleSatd16x16_mmi (uint8_t* pSample1, int32_t iStride1,
1051 uint8_t* pSample2, int32_t iStride2) {
1052 int32_t iSatdSum = 0;
1053 BACKUP_REG;
1054 __asm__ volatile (
1055 ".set arch=loongson3a \n\t"
/* Clear $f24, $f26 and $f28, and load the constant 1 into $f30. */
1056 "xor $f24, $f24, $f24 \n\t"
1057 "xor $f26, $f26, $f26 \n\t"
1058 "dli $8, 0x1 \n\t"
1059 "xor $f28, $f28, $f28 \n\t"
1060 "dmtc1 $8, $f30 \n\t"
/* Remember the original block pointers in $9 / $10. */
1061 "move $9, %[pSample1] \n\t"
1062 "move $10, %[pSample2] \n\t"
1063
1064 MMI_GetSatd8x8
1065 MMI_GetSatd8x8_Offset8
1066
1067 MMI_GetSatd8x8
1068 MMI_GetSatd8x8_End
1069
/* Shift every halfword of the accumulators right by $f30
   (presumably still 1 after the macros -- confirm). Unlike the smaller
   variants, the 0x4e constant for MMI_SumWHorizon is placed in $f0 here,
   so $f30 is left untouched. */
1070 "dli $8, 0x4e \n\t"
1071 "psrlh $f24, $f24, $f30 \n\t"
1072 "dmtc1 $8, $f0 \n\t"
1073 "psrlh $f26, $f26, $f30 \n\t"
1074 MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f0)
1075 "mfc1 %[iSatdSum], $f24 \n\t"
1076 : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1077 [pSample2]"+&r"((unsigned char *)pSample2)
1078 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
1079 : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
1080 "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
1081 "$f26", "$f28", "$f30"
1082 );
1083 RECOVER_REG;
1084 return iSatdSum;
1085 }
1086
/*!
 * \brief Computes four 16x16 SADs of pSample1 against pSample2 displaced by
 *        one position in each direction, and stores them to pSad.
 *
 * Four accumulator pairs are kept in the asm:
 *   $f16/$f18 - pSample1 row r against the pSample2 row one stride earlier
 *   $f20/$f22 - filled in the tail and presumably inside MMI_Get4LW16Sad;
 *               presumably the pSample2 row one stride later -- confirm
 *   $f24/$f26 - pSample1 row r against pSample2 row r read at byte offset -1
 *   $f28/$f30 - pSample1 row r against pSample2 row r read at byte offset +1
 * The rows in between are handled by MMI_Get4LW16Sad, which is defined
 * outside this block. Each call rotates three pSample1 row register pairs
 * and alternates the pSample2 row address between $9 and %[pSample2].
 *
 * \param pSample1 16-byte-wide source rows, loaded with gslqc1
 * \param iStride1 row stride for pSample1
 * \param pSample2 reference rows, loaded with gsldlc1/gsldrc1 pairs; the rows
 *                 at pSample2 - iStride2 and the bytes at offsets -1 and +16
 *                 of each row are read as well
 * \param iStride2 row stride for pSample2
 * \param pSad     receives four 32-bit sums through one gssqc1 store
 *
 * NOTE(review): gslqc1 / gssqc1 are quad-word accesses; presumably pSample1
 * rows and pSad must be 16-byte aligned -- confirm with the Loongson manual.
 */
WelsSampleSadFour16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1087 void WelsSampleSadFour16x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1088 int32_t iStride2, int32_t* pSad) {
1089 BACKUP_REG;
1090 __asm__ volatile (
1091 ".set arch=loongson3a \n\t"
/* Clear all four accumulator pairs and step pSample2 back by one stride. */
1092 "xor $f16, $f16, $f16 \n\t"
1093 "xor $f18, $f18, $f18 \n\t"
1094 "xor $f20, $f20, $f20 \n\t"
1095 "xor $f22, $f22, $f22 \n\t"
1096 PTR_SUBU "%[pSample2], %[pSample2], %[iStride2] \n\t"
1097 "xor $f24, $f24, $f24 \n\t"
1098 "xor $f26, $f26, $f26 \n\t"
1099 "xor $f28, $f28, $f28 \n\t"
1100 "xor $f30, $f30, $f30 \n\t"
/* pSample1 row 0 ($f0/$f2) against the pSample2 row above the block
   -> $f16/$f18. */
1101 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1102 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1103 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1104 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1105 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1106 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
1107 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
1108 "pasubub $f12, $f12, $f0 \n\t"
1109 "pasubub $f14, $f14, $f2 \n\t"
1110 "biadd $f12, $f12 \n\t"
1111 "biadd $f14, $f14 \n\t"
1112 "paddh $f16, $f16, $f12 \n\t"
1113 "paddh $f18, $f18, $f14 \n\t"
1114
/* pSample1 row 1 ($f4/$f6) against pSample2 row 0 -> $f16/$f18. */
1115 "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
1116 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1117 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1118 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1119 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1120 "pasubub $f12, $f12, $f4 \n\t"
1121 "pasubub $f14, $f14, $f6 \n\t"
1122 "biadd $f12, $f12 \n\t"
1123 "biadd $f14, $f14 \n\t"
1124 "paddh $f16, $f16, $f12 \n\t"
1125 "paddh $f18, $f18, $f14 \n\t"
1126
/* pSample1 row 0 against pSample2 row 0 read from byte offset -1
   -> $f24/$f26. */
1127 "gsldlc1 $f8, 0x6(%[pSample2]) \n\t"
1128 "gsldlc1 $f10, 0xE(%[pSample2]) \n\t"
1129 "gsldrc1 $f8, -0x1(%[pSample2]) \n\t"
1130 "gsldrc1 $f10, 0x7(%[pSample2]) \n\t"
1131 "pasubub $f8, $f8, $f0 \n\t"
1132 "pasubub $f10, $f10, $f2 \n\t"
1133 "biadd $f8, $f8 \n\t"
1134 "biadd $f10, $f10 \n\t"
1135 "paddh $f24, $f24, $f8 \n\t"
1136 "paddh $f26, $f26, $f10 \n\t"
1137
/* pSample1 row 0 against pSample2 row 0 read from byte offset +1
   -> $f28/$f30. $8/$9 are set to the next row of each block. */
1138 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1139 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1140 "gsldlc1 $f14, 0x10(%[pSample2]) \n\t"
1141 "gsldrc1 $f14, 0x9(%[pSample2]) \n\t"
1142 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1143 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1144 "pasubub $f12, $f12, $f0 \n\t"
1145 "pasubub $f14, $f14, $f2 \n\t"
1146 "biadd $f12, $f12 \n\t"
1147 "biadd $f14, $f14 \n\t"
1148 "paddh $f28, $f28, $f12 \n\t"
1149 "paddh $f30, $f30, $f14 \n\t"
1150
/* Steady state: each step loads one new pSample1 row and one new pSample2
   row, then expands MMI_Get4LW16Sad. The macro presumably updates all
   four accumulators for the middle row of the three register pairs --
   confirm against its definition. */
1151 "gslqc1 $f10, $f8, 0x0($8) \n\t"
1152 "gsldlc1 $f12, 0x7($9) \n\t"
1153 "gsldlc1 $f14, 0xF($9) \n\t"
1154 "gsldrc1 $f12, 0x0($9) \n\t"
1155 "gsldrc1 $f14, 0x8($9) \n\t"
1156 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1157 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1158 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1159 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1160 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1161 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1162 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1163 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1164 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1165 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1166 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1167 "gslqc1 $f6, $f4, 0x0($8) \n\t"
1168 "gsldlc1 $f12, 0x7($9) \n\t"
1169 "gsldlc1 $f14, 0xF($9) \n\t"
1170 "gsldrc1 $f12, 0x0($9) \n\t"
1171 "gsldrc1 $f14, 0x8($9) \n\t"
1172 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1173 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1174 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1175 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
1176 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1177 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1178 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1179 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1180 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1181 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1182 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1183 "gslqc1 $f2, $f0, 0x0($8) \n\t"
1184 "gsldlc1 $f12, 0x7($9) \n\t"
1185 "gsldlc1 $f14, 0xF($9) \n\t"
1186 "gsldrc1 $f12, 0x0($9) \n\t"
1187 "gsldrc1 $f14, 0x8($9) \n\t"
1188 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1189 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1190 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
1191 "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
1192 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1193 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1194 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1195 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1196 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1197 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1198 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
1199
1200 "gslqc1 $f10, $f8, 0x0($8) \n\t"
1201 "gsldlc1 $f12, 0x7($9) \n\t"
1202 "gsldlc1 $f14, 0xF($9) \n\t"
1203 "gsldrc1 $f12, 0x0($9) \n\t"
1204 "gsldrc1 $f14, 0x8($9) \n\t"
1205 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1206 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1207 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1208 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1209 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1210 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1211 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1212 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1213 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1214 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1215 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1216 "gslqc1 $f6, $f4, 0x0($8) \n\t"
1217 "gsldlc1 $f12, 0x7($9) \n\t"
1218 "gsldlc1 $f14, 0xF($9) \n\t"
1219 "gsldrc1 $f12, 0x0($9) \n\t"
1220 "gsldrc1 $f14, 0x8($9) \n\t"
1221 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1222 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1223 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1224 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
1225 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1226 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1227 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1228 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1229 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1230 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1231 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1232 "gslqc1 $f2, $f0, 0x0($8) \n\t"
1233 "gsldlc1 $f12, 0x7($9) \n\t"
1234 "gsldlc1 $f14, 0xF($9) \n\t"
1235 "gsldrc1 $f12, 0x0($9) \n\t"
1236 "gsldrc1 $f14, 0x8($9) \n\t"
1237 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1238 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1239 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
1240 "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
1241 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1242 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1243 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1244 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1245 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1246 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1247 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
1248
1249 "gslqc1 $f10, $f8, 0x0($8) \n\t"
1250 "gsldlc1 $f12, 0x7($9) \n\t"
1251 "gsldlc1 $f14, 0xF($9) \n\t"
1252 "gsldrc1 $f12, 0x0($9) \n\t"
1253 "gsldrc1 $f14, 0x8($9) \n\t"
1254 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1255 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1256 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
/* Last pSample1 row load (into $f0/$f2); no further pSample1 pointer
   is derived after this point. */
1257 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1258 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1259 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1260 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1261 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1262 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1263 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
/* Tail: the row at $9 is compared with the previous pSample1 row
   ($f8/$f10) -> $f20/$f22 ... */
1264 "gsldlc1 $f12, 0x7($9) \n\t"
1265 "gsldlc1 $f14, 0xF($9) \n\t"
1266 "gsldrc1 $f12, 0x0($9) \n\t"
1267 "gsldrc1 $f14, 0x8($9) \n\t"
1268 "pasubub $f8, $f8, $f12 \n\t"
1269 "pasubub $f10, $f10, $f14 \n\t"
1270 "biadd $f8, $f8 \n\t"
1271 "biadd $f10, $f10 \n\t"
1272 "paddh $f20, $f20, $f8 \n\t"
1273 "paddh $f22, $f22, $f10 \n\t"
1274
/* ... and, read at byte offsets -1 and +1, with the last pSample1 row
   ($f0/$f2) -> $f24/$f26 and $f28/$f30. */
1275 "gsldlc1 $f8, 0x6($9) \n\t"
1276 "gsldlc1 $f10, 0xE($9) \n\t"
1277 "gsldrc1 $f8, -0x1($9) \n\t"
1278 "gsldrc1 $f10, 0x7($9) \n\t"
1279 "pasubub $f8, $f8, $f0 \n\t"
1280 "pasubub $f10, $f10, $f2 \n\t"
1281 "biadd $f8, $f8 \n\t"
1282 "biadd $f10, $f10 \n\t"
1283 "paddh $f24, $f24, $f8 \n\t"
1284 "paddh $f26, $f26, $f10 \n\t"
1285
1286 "gsldlc1 $f12, 0x8($9) \n\t"
1287 "gsldlc1 $f14, 0x10($9) \n\t"
1288 "gsldrc1 $f12, 0x1($9) \n\t"
1289 "gsldrc1 $f14, 0x9($9) \n\t"
1290 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1291 "pasubub $f12, $f12, $f0 \n\t"
1292 "pasubub $f14, $f14, $f2 \n\t"
1293 "biadd $f12, $f12 \n\t"
1294 "biadd $f14, $f14 \n\t"
1295 "paddh $f28, $f28, $f12 \n\t"
1296 "paddh $f30, $f30, $f14 \n\t"
1297
/* One more pSample2 row (one stride past $9) against the last pSample1
   row -> $f20/$f22. */
1298 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1299 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1300 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1301 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1302 "pasubub $f0, $f0, $f12 \n\t"
1303 "pasubub $f2, $f2, $f14 \n\t"
1304 "biadd $f0, $f0 \n\t"
1305 "biadd $f2, $f2 \n\t"
1306 "paddh $f20, $f20, $f0 \n\t"
1307 "paddh $f22, $f22, $f2 \n\t"
1308
/* Fold each accumulator pair, pack the four sums as words into $f16 and
   $f24 and store them with one quad-word write. Presumably the memory
   order is { $f16, $f20, $f24, $f28 } sums -- confirm against the gssqc1
   operand order. */
1309 "paddh $f16, $f16, $f18 \n\t"
1310 "paddh $f20, $f20, $f22 \n\t"
1311 "paddh $f24, $f24, $f26 \n\t"
1312 "paddh $f28, $f28, $f30 \n\t"
1313 "punpcklwd $f16, $f16, $f20 \n\t"
1314 "punpcklwd $f24, $f24, $f28 \n\t"
1315 "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
1316 : [pSample1]"+&r"((unsigned char *)pSample1),
1317 [pSample2]"+&r"((unsigned char *)pSample2)
1318 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1319 [pSad]"r"((int *)pSad)
1320 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1321 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1322 );
1323 RECOVER_REG;
1324 }
1325
/*!
 * \brief Computes four 16x8 SADs of pSample1 against pSample2 displaced by
 *        one position in each direction, and stores them to pSad.
 *
 * Four accumulator pairs are kept in the asm:
 *   $f16/$f18 - pSample1 row r against the pSample2 row one stride earlier
 *   $f20/$f22 - filled in the tail and presumably inside MMI_Get4LW16Sad;
 *               presumably the pSample2 row one stride later -- confirm
 *   $f24/$f26 - pSample1 row r against pSample2 row r read at byte offset -1
 *   $f28/$f30 - pSample1 row r against pSample2 row r read at byte offset +1
 * The rows in between are handled by MMI_Get4LW16Sad, which is defined
 * outside this block.
 *
 * \param pSample1 16-byte-wide source rows, loaded with gslqc1
 * \param iStride1 row stride for pSample1
 * \param pSample2 reference rows, loaded with gsldlc1/gsldrc1 pairs; the rows
 *                 at pSample2 - iStride2 and the bytes at offsets -1 and +16
 *                 of each row are read as well
 * \param iStride2 row stride for pSample2
 * \param pSad     receives four 32-bit sums through one gssqc1 store
 *
 * NOTE(review): gslqc1 / gssqc1 are quad-word accesses; presumably pSample1
 * rows and pSad must be 16-byte aligned -- confirm with the Loongson manual.
 */
WelsSampleSadFour16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1326 void WelsSampleSadFour16x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1327 int32_t iStride2, int32_t* pSad) {
1328 BACKUP_REG;
1329 __asm__ volatile (
1330 ".set arch=loongson3a \n\t"
/* Clear all four accumulator pairs, load pSample1 row 0 into $f0/$f2 and
   step pSample2 back by one stride. */
1331 "xor $f16, $f16, $f16 \n\t"
1332 "xor $f18, $f18, $f18 \n\t"
1333 "xor $f20, $f20, $f20 \n\t"
1334 "xor $f22, $f22, $f22 \n\t"
1335 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1336 PTR_SUBU "%[pSample2], %[pSample2], %[iStride2] \n\t"
1337 "xor $f24, $f24, $f24 \n\t"
1338 "xor $f26, $f26, $f26 \n\t"
1339 "xor $f28, $f28, $f28 \n\t"
1340 "xor $f30, $f30, $f30 \n\t"
/* pSample1 row 0 against the pSample2 row above the block -> $f16/$f18. */
1341 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1342 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1343 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1344 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1345 PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
1346 "pasubub $f12, $f12, $f0 \n\t"
1347 "pasubub $f14, $f14, $f2 \n\t"
1348 PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
1349 "biadd $f12, $f12 \n\t"
1350 "biadd $f14, $f14 \n\t"
1351 "paddh $f16, $f16, $f12 \n\t"
1352 "paddh $f18, $f18, $f14 \n\t"
1353
/* pSample1 row 1 ($f4/$f6) against pSample2 row 0 -> $f16/$f18. */
1354 "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
1355 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1356 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1357 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1358 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1359 "pasubub $f12, $f12, $f4 \n\t"
1360 "pasubub $f14, $f14, $f6 \n\t"
1361 "biadd $f12, $f12 \n\t"
1362 "biadd $f14, $f14 \n\t"
1363 "paddh $f16, $f16, $f12 \n\t"
1364 "paddh $f18, $f18, $f14 \n\t"
1365
/* pSample1 row 0 against pSample2 row 0 read from byte offset -1
   -> $f24/$f26. */
1366 "gsldlc1 $f8, 0x6(%[pSample2]) \n\t"
1367 "gsldlc1 $f10, 0xE(%[pSample2]) \n\t"
1368 "gsldrc1 $f8, -0x1(%[pSample2]) \n\t"
1369 "gsldrc1 $f10, 0x7(%[pSample2]) \n\t"
1370 "pasubub $f8, $f8, $f0 \n\t"
1371 "pasubub $f10, $f10, $f2 \n\t"
1372 "biadd $f8, $f8 \n\t"
1373 "biadd $f10, $f10 \n\t"
1374 "paddh $f24, $f24, $f8 \n\t"
1375 "paddh $f26, $f26, $f10 \n\t"
1376
/* pSample1 row 0 against pSample2 row 0 read from byte offset +1
   -> $f28/$f30. $8/$9 are set to the next row of each block. */
1377 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1378 "gsldlc1 $f14, 0x10(%[pSample2]) \n\t"
1379 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1380 "gsldrc1 $f14, 0x9(%[pSample2]) \n\t"
1381 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1382 "pasubub $f12, $f12, $f0 \n\t"
1383 "pasubub $f14, $f14, $f2 \n\t"
1384 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1385 "biadd $f12, $f12 \n\t"
1386 "biadd $f14, $f14 \n\t"
1387 "paddh $f28, $f28, $f12 \n\t"
1388 "paddh $f30, $f30, $f14 \n\t"
1389
/* Steady state: each step loads one new pSample1 row and one new pSample2
   row, then expands MMI_Get4LW16Sad. The macro presumably updates all
   four accumulators for the middle row of the three register pairs --
   confirm against its definition. */
1390 "gslqc1 $f10, $f8, 0x0($8) \n\t"
1391 "gsldlc1 $f12, 0x7($9) \n\t"
1392 "gsldlc1 $f14, 0xF($9) \n\t"
1393 "gsldrc1 $f12, 0x0($9) \n\t"
1394 "gsldrc1 $f14, 0x8($9) \n\t"
1395 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1396 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1397 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1398 "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
1399 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1400 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1401 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1402 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1403 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1404 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1405 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1406 "gslqc1 $f6, $f4, 0x0($8) \n\t"
1407 "gsldlc1 $f12, 0x7($9) \n\t"
1408 "gsldlc1 $f14, 0xF($9) \n\t"
1409 "gsldrc1 $f12, 0x0($9) \n\t"
1410 "gsldrc1 $f14, 0x8($9) \n\t"
1411 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1412 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1413 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1414 "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
1415 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1416 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1417 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1418 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1419 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1420 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1421 MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1422 "gslqc1 $f2, $f0, 0x0($8) \n\t"
1423 "gsldlc1 $f12, 0x7($9) \n\t"
1424 "gsldlc1 $f14, 0xF($9) \n\t"
1425 "gsldrc1 $f12, 0x0($9) \n\t"
1426 "gsldrc1 $f14, 0x8($9) \n\t"
1427 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1428 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1429 MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
/* Last pSample1 row load (into $f4/$f6); no further pSample1 pointer
   is derived after this point. */
1430 "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
1431 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1432 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1433 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1434 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1435 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1436 MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
/* Tail: the row at $9 is compared with the previous pSample1 row
   ($f0/$f2) -> $f20/$f22 ... */
1437 "gsldlc1 $f12, 0x7($9) \n\t"
1438 "gsldlc1 $f14, 0xF($9) \n\t"
1439 "gsldrc1 $f12, 0x0($9) \n\t"
1440 "gsldrc1 $f14, 0x8($9) \n\t"
1441 "pasubub $f0, $f0, $f12 \n\t"
1442 "pasubub $f2, $f2, $f14 \n\t"
1443 "biadd $f0, $f0 \n\t"
1444 "biadd $f2, $f2 \n\t"
1445 "paddh $f20, $f20, $f0 \n\t"
1446 "paddh $f22, $f22, $f2 \n\t"
1447
/* ... and, read at byte offsets -1 and +1, with the last pSample1 row
   ($f4/$f6) -> $f24/$f26 and $f28/$f30. */
1448 "gsldlc1 $f0, 0x6($9) \n\t"
1449 "gsldlc1 $f2, 0xE($9) \n\t"
1450 "gsldrc1 $f0, -0x1($9) \n\t"
1451 "gsldrc1 $f2, 0x7($9) \n\t"
1452 "pasubub $f0, $f0, $f4 \n\t"
1453 "pasubub $f2, $f2, $f6 \n\t"
1454 "biadd $f0, $f0 \n\t"
1455 "biadd $f2, $f2 \n\t"
1456 "paddh $f24, $f24, $f0 \n\t"
1457 "paddh $f26, $f26, $f2 \n\t"
1458
1459 "gsldlc1 $f12, 0x8($9) \n\t"
1460 "gsldlc1 $f14, 0x10($9) \n\t"
1461 "gsldrc1 $f12, 0x1($9) \n\t"
1462 "gsldrc1 $f14, 0x9($9) \n\t"
1463 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1464 "pasubub $f12, $f12, $f4 \n\t"
1465 "pasubub $f14, $f14, $f6 \n\t"
1466 "biadd $f12, $f12 \n\t"
1467 "biadd $f14, $f14 \n\t"
1468 "paddh $f28, $f28, $f12 \n\t"
1469 "paddh $f30, $f30, $f14 \n\t"
1470
/* One more pSample2 row (one stride past $9) against the last pSample1
   row -> $f20/$f22. */
1471 "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
1472 "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
1473 "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
1474 "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
1475 "pasubub $f4, $f4, $f12 \n\t"
1476 "pasubub $f6, $f6, $f14 \n\t"
1477 "biadd $f4, $f4 \n\t"
1478 "biadd $f6, $f6 \n\t"
1479 "paddh $f20, $f20, $f4 \n\t"
1480 "paddh $f22, $f22, $f6 \n\t"
1481
/* Fold each accumulator pair, pack the four sums as words into $f16 and
   $f24 and store them with one quad-word write. Presumably the memory
   order is { $f16, $f20, $f24, $f28 } sums -- confirm against the gssqc1
   operand order. */
1482 "paddh $f16, $f16, $f18 \n\t"
1483 "paddh $f20, $f20, $f22 \n\t"
1484 "paddh $f24, $f24, $f26 \n\t"
1485 "paddh $f28, $f28, $f30 \n\t"
1486 "punpcklwd $f16, $f16, $f20 \n\t"
1487 "punpcklwd $f24, $f24, $f28 \n\t"
1488 "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
1489 : [pSample1]"+&r"((unsigned char *)pSample1),
1490 [pSample2]"+&r"((unsigned char *)pSample2)
1491 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1492 [pSad]"r"((int *)pSad)
1493 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1494 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
1495 );
1496 RECOVER_REG;
1497 }
1498
WelsSampleSadFour8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1499 void WelsSampleSadFour8x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1500 int32_t iStride2, int32_t* pSad) {
1501 BACKUP_REG;
1502 __asm__ volatile (
1503 ".set arch=loongson3a \n\t"
1504 "xor $f16, $f16, $f16 \n\t"
1505 "xor $f18, $f18, $f18 \n\t"
1506 "xor $f20, $f20, $f20 \n\t"
1507 "xor $f22, $f22, $f22 \n\t"
1508 "xor $f24, $f24, $f24 \n\t"
1509 "xor $f26, $f26, $f26 \n\t"
1510 "xor $f28, $f28, $f28 \n\t"
1511 "xor $f30, $f30, $f30 \n\t"
1512 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1513 PTR_SUBU "$9, %[pSample2], %[iStride2] \n\t"
1514 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1515 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1516 "gsldlc1 $f2, 0x7($8) \n\t"
1517 "gsldlc1 $f12, 0x7($9) \n\t"
1518 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1519 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1520 "gsldrc1 $f2, 0x0($8) \n\t"
1521 "gsldrc1 $f12, 0x0($9) \n\t"
1522 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1523 "pasubub $f12, $f12, $f0 \n\t"
1524 "pasubub $f14, $f14, $f2 \n\t"
1525 "biadd $f12, $f12 \n\t"
1526 "biadd $f14, $f14 \n\t"
1527 "paddh $f16, $f16, $f12 \n\t"
1528 "paddh $f18, $f18, $f14 \n\t"
1529
1530 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1531 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1532 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1533 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1534 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1535
1536 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1537 "gsldlc1 $f6, 0x6($9) \n\t"
1538 "gsldlc1 $f14, 0x8($9) \n\t"
1539 "gsldrc1 $f6, -0x1($9) \n\t"
1540 "gsldrc1 $f14, 0x1($9) \n\t"
1541 "pasubub $f4, $f4, $f0 \n\t"
1542 "pasubub $f6, $f6, $f2 \n\t"
1543 "biadd $f4, $f4 \n\t"
1544 "biadd $f6, $f6 \n\t"
1545 "paddh $f24, $f24, $f4 \n\t"
1546 "paddh $f26, $f26, $f6 \n\t"
1547 "pasubub $f12, $f12, $f0 \n\t"
1548 "pasubub $f14, $f14, $f2 \n\t"
1549 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1550 "biadd $f12, $f12 \n\t"
1551 "biadd $f14, $f14 \n\t"
1552 "paddh $f28, $f28, $f12 \n\t"
1553 "paddh $f30, $f30, $f14 \n\t"
1554
1555 "gsldlc1 $f12, 0x7($9) \n\t"
1556 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1557 "gsldrc1 $f12, 0x0($9) \n\t"
1558 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1559 "pasubub $f0, $f0, $f12 \n\t"
1560 "pasubub $f2, $f2, $f14 \n\t"
1561 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1562 "biadd $f0, $f0 \n\t"
1563 "biadd $f2, $f2 \n\t"
1564 "paddh $f20, $f20, $f0 \n\t"
1565 "paddh $f22, $f22, $f2 \n\t"
1566
1567 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1568 "gsldlc1 $f2, 0x7($8) \n\t"
1569 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1570 "gsldrc1 $f2, 0x0($8) \n\t"
1571 "pasubub $f12, $f12, $f0 \n\t"
1572 "pasubub $f14, $f14, $f2 \n\t"
1573 "biadd $f12, $f12 \n\t"
1574 "biadd $f14, $f14 \n\t"
1575 "paddh $f16, $f16, $f12 \n\t"
1576 "paddh $f18, $f18, $f14 \n\t"
1577
1578 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1579 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1580 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1581 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1582 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1583
1584 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1585 "gsldlc1 $f6, 0x6($9) \n\t"
1586 "gsldlc1 $f14, 0x8($9) \n\t"
1587 "gsldrc1 $f6, -0x1($9) \n\t"
1588 "gsldrc1 $f14, 0x1($9) \n\t"
1589
1590 "pasubub $f4, $f4, $f0 \n\t"
1591 "pasubub $f6, $f6, $f2 \n\t"
1592 "biadd $f4, $f4 \n\t"
1593 "biadd $f6, $f6 \n\t"
1594 "paddh $f24, $f24, $f4 \n\t"
1595 "paddh $f26, $f26, $f6 \n\t"
1596 "pasubub $f12, $f12, $f0 \n\t"
1597 "pasubub $f14, $f14, $f2 \n\t"
1598 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1599 "biadd $f12, $f12 \n\t"
1600 "biadd $f14, $f14 \n\t"
1601 "paddh $f28, $f28, $f12 \n\t"
1602 "paddh $f30, $f30, $f14 \n\t"
1603
1604 "gsldlc1 $f12, 0x7($9) \n\t"
1605 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1606 "gsldrc1 $f12, 0x0($9) \n\t"
1607 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1608 "pasubub $f0, $f0, $f12 \n\t"
1609 "pasubub $f2, $f2, $f14 \n\t"
1610 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1611 "biadd $f0, $f0 \n\t"
1612 "biadd $f2, $f2 \n\t"
1613 "paddh $f20, $f20, $f0 \n\t"
1614 "paddh $f22, $f22, $f2 \n\t"
1615
1616 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1617 "gsldlc1 $f2, 0x7($8) \n\t"
1618 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1619 "gsldrc1 $f2, 0x0($8) \n\t"
1620 "pasubub $f12, $f12, $f0 \n\t"
1621 "pasubub $f14, $f14, $f2 \n\t"
1622 "biadd $f12, $f12 \n\t"
1623 "biadd $f14, $f14 \n\t"
1624 "paddh $f16, $f16, $f12 \n\t"
1625 "paddh $f18, $f18, $f14 \n\t"
1626
1627 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1628 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1629 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1630 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1631 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1632
1633 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1634 "gsldlc1 $f6, 0x6($9) \n\t"
1635 "gsldlc1 $f14, 0x8($9) \n\t"
1636 "gsldrc1 $f6, -0x1($9) \n\t"
1637 "gsldrc1 $f14, 0x1($9) \n\t"
1638
1639 "pasubub $f4, $f4, $f0 \n\t"
1640 "pasubub $f6, $f6, $f2 \n\t"
1641 "biadd $f4, $f4 \n\t"
1642 "biadd $f6, $f6 \n\t"
1643 "paddh $f24, $f24, $f4 \n\t"
1644 "paddh $f26, $f26, $f6 \n\t"
1645 "pasubub $f12, $f12, $f0 \n\t"
1646 "pasubub $f14, $f14, $f2 \n\t"
1647 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1648 "biadd $f12, $f12 \n\t"
1649 "biadd $f14, $f14 \n\t"
1650 "paddh $f28, $f28, $f12 \n\t"
1651 "paddh $f30, $f30, $f14 \n\t"
1652
1653 "gsldlc1 $f12, 0x7($9) \n\t"
1654 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1655 "gsldrc1 $f12, 0x0($9) \n\t"
1656 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1657 "pasubub $f0, $f0, $f12 \n\t"
1658 "pasubub $f2, $f2, $f14 \n\t"
1659 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1660 "biadd $f0, $f0 \n\t"
1661 "biadd $f2, $f2 \n\t"
1662 "paddh $f20, $f20, $f0 \n\t"
1663 "paddh $f22, $f22, $f2 \n\t"
1664
1665 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1666 "gsldlc1 $f2, 0x7($8) \n\t"
1667 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1668 "gsldrc1 $f2, 0x0($8) \n\t"
1669 "pasubub $f12, $f12, $f0 \n\t"
1670 "pasubub $f14, $f14, $f2 \n\t"
1671 "biadd $f12, $f12 \n\t"
1672 "biadd $f14, $f14 \n\t"
1673 "paddh $f16, $f16, $f12 \n\t"
1674 "paddh $f18, $f18, $f14 \n\t"
1675
1676 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1677 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1678 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1679 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1680 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1681
1682 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1683 "gsldlc1 $f6, 0x6($9) \n\t"
1684 "gsldlc1 $f14, 0x8($9) \n\t"
1685 "gsldrc1 $f6, -0x1($9) \n\t"
1686 "gsldrc1 $f14, 0x1($9) \n\t"
1687 "pasubub $f4, $f4, $f0 \n\t"
1688 "pasubub $f6, $f6, $f2 \n\t"
1689 "biadd $f4, $f4 \n\t"
1690 "biadd $f6, $f6 \n\t"
1691 "paddh $f24, $f24, $f4 \n\t"
1692 "paddh $f26, $f26, $f6 \n\t"
1693 "pasubub $f12, $f12, $f0 \n\t"
1694 "pasubub $f14, $f14, $f2 \n\t"
1695 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1696 "biadd $f12, $f12 \n\t"
1697 "biadd $f14, $f14 \n\t"
1698 "paddh $f28, $f28, $f12 \n\t"
1699 "paddh $f30, $f30, $f14 \n\t"
1700
1701 "gsldlc1 $f12, 0x7($9) \n\t"
1702 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1703 "gsldrc1 $f12, 0x0($9) \n\t"
1704 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1705 "pasubub $f0, $f0, $f12 \n\t"
1706 "pasubub $f2, $f2, $f14 \n\t"
1707 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1708 "biadd $f0, $f0 \n\t"
1709 "biadd $f2, $f2 \n\t"
1710 "paddh $f20, $f20, $f0 \n\t"
1711 "paddh $f22, $f22, $f2 \n\t"
1712
1713 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1714 "gsldlc1 $f2, 0x7($8) \n\t"
1715 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1716 "gsldrc1 $f2, 0x0($8) \n\t"
1717 "pasubub $f12, $f12, $f0 \n\t"
1718 "pasubub $f14, $f14, $f2 \n\t"
1719 "biadd $f12, $f12 \n\t"
1720 "biadd $f14, $f14 \n\t"
1721 "paddh $f16, $f16, $f12 \n\t"
1722 "paddh $f18, $f18, $f14 \n\t"
1723
1724 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1725 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1726 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1727 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1728 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1729
1730 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1731 "gsldlc1 $f6, 0x6($9) \n\t"
1732 "gsldlc1 $f14, 0x8($9) \n\t"
1733 "gsldrc1 $f6, -0x1($9) \n\t"
1734 "gsldrc1 $f14, 0x1($9) \n\t"
1735
1736 "pasubub $f4, $f4, $f0 \n\t"
1737 "pasubub $f6, $f6, $f2 \n\t"
1738 "biadd $f4, $f4 \n\t"
1739 "biadd $f6, $f6 \n\t"
1740 "paddh $f24, $f24, $f4 \n\t"
1741 "paddh $f26, $f26, $f6 \n\t"
1742 "pasubub $f12, $f12, $f0 \n\t"
1743 "pasubub $f14, $f14, $f2 \n\t"
1744 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1745 "biadd $f12, $f12 \n\t"
1746 "biadd $f14, $f14 \n\t"
1747 "paddh $f28, $f28, $f12 \n\t"
1748 "paddh $f30, $f30, $f14 \n\t"
1749
1750 "gsldlc1 $f12, 0x7($9) \n\t"
1751 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1752 "gsldrc1 $f12, 0x0($9) \n\t"
1753 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1754 "pasubub $f0, $f0, $f12 \n\t"
1755 "pasubub $f2, $f2, $f14 \n\t"
1756 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1757 "biadd $f0, $f0 \n\t"
1758 "biadd $f2, $f2 \n\t"
1759 "paddh $f20, $f20, $f0 \n\t"
1760 "paddh $f22, $f22, $f2 \n\t"
1761
1762 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1763 "gsldlc1 $f2, 0x7($8) \n\t"
1764 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1765 "gsldrc1 $f2, 0x0($8) \n\t"
1766 "pasubub $f12, $f12, $f0 \n\t"
1767 "pasubub $f14, $f14, $f2 \n\t"
1768 "biadd $f12, $f12 \n\t"
1769 "biadd $f14, $f14 \n\t"
1770 "paddh $f16, $f16, $f12 \n\t"
1771 "paddh $f18, $f18, $f14 \n\t"
1772
1773 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1774 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1775 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1776 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1777 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1778
1779 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1780 "gsldlc1 $f6, 0x6($9) \n\t"
1781 "gsldlc1 $f14, 0x8($9) \n\t"
1782 "gsldrc1 $f6, -0x1($9) \n\t"
1783 "gsldrc1 $f14, 0x1($9) \n\t"
1784
1785 "pasubub $f4, $f4, $f0 \n\t"
1786 "pasubub $f6, $f6, $f2 \n\t"
1787 "biadd $f4, $f4 \n\t"
1788 "biadd $f6, $f6 \n\t"
1789 "paddh $f24, $f24, $f4 \n\t"
1790 "paddh $f26, $f26, $f6 \n\t"
1791 "pasubub $f12, $f12, $f0 \n\t"
1792 "pasubub $f14, $f14, $f2 \n\t"
1793 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1794 "biadd $f12, $f12 \n\t"
1795 "biadd $f14, $f14 \n\t"
1796 "paddh $f28, $f28, $f12 \n\t"
1797 "paddh $f30, $f30, $f14 \n\t"
1798
1799 "gsldlc1 $f12, 0x7($9) \n\t"
1800 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1801 "gsldrc1 $f12, 0x0($9) \n\t"
1802 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1803 "pasubub $f0, $f0, $f12 \n\t"
1804 "pasubub $f2, $f2, $f14 \n\t"
1805 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1806 "biadd $f0, $f0 \n\t"
1807 "biadd $f2, $f2 \n\t"
1808 "paddh $f20, $f20, $f0 \n\t"
1809 "paddh $f22, $f22, $f2 \n\t"
1810
1811 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1812 "gsldlc1 $f2, 0x7($8) \n\t"
1813 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1814 "gsldrc1 $f2, 0x0($8) \n\t"
1815 "pasubub $f12, $f12, $f0 \n\t"
1816 "pasubub $f14, $f14, $f2 \n\t"
1817 "biadd $f12, $f12 \n\t"
1818 "biadd $f14, $f14 \n\t"
1819 "paddh $f16, $f16, $f12 \n\t"
1820 "paddh $f18, $f18, $f14 \n\t"
1821
1822 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1823 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1824 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1825 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1826 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1827
1828 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1829 "gsldlc1 $f6, 0x6($9) \n\t"
1830 "gsldlc1 $f14, 0x8($9) \n\t"
1831 "gsldrc1 $f6, -0x1($9) \n\t"
1832 "gsldrc1 $f14, 0x1($9) \n\t"
1833
1834 "pasubub $f4, $f4, $f0 \n\t"
1835 "pasubub $f6, $f6, $f2 \n\t"
1836 "biadd $f4, $f4 \n\t"
1837 "biadd $f6, $f6 \n\t"
1838 "paddh $f24, $f24, $f4 \n\t"
1839 "paddh $f26, $f26, $f6 \n\t"
1840 "pasubub $f12, $f12, $f0 \n\t"
1841 "pasubub $f14, $f14, $f2 \n\t"
1842 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1843 "biadd $f12, $f12 \n\t"
1844 "biadd $f14, $f14 \n\t"
1845 "paddh $f28, $f28, $f12 \n\t"
1846 "paddh $f30, $f30, $f14 \n\t"
1847
1848 "gsldlc1 $f12, 0x7($9) \n\t"
1849 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1850 "gsldrc1 $f12, 0x0($9) \n\t"
1851 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1852 "pasubub $f0, $f0, $f12 \n\t"
1853 "pasubub $f2, $f2, $f14 \n\t"
1854 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1855 "biadd $f0, $f0 \n\t"
1856 "biadd $f2, $f2 \n\t"
1857 "paddh $f20, $f20, $f0 \n\t"
1858 "paddh $f22, $f22, $f2 \n\t"
1859
1860 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1861 "gsldlc1 $f2, 0x7($8) \n\t"
1862 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1863 "gsldrc1 $f2, 0x0($8) \n\t"
1864 "pasubub $f12, $f12, $f0 \n\t"
1865 "pasubub $f14, $f14, $f2 \n\t"
1866 "biadd $f12, $f12 \n\t"
1867 "biadd $f14, $f14 \n\t"
1868 "paddh $f16, $f16, $f12 \n\t"
1869 "paddh $f18, $f18, $f14 \n\t"
1870
1871 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1872 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1873 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1874 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1875 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1876
1877 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1878 "gsldlc1 $f6, 0x6($9) \n\t"
1879 "gsldlc1 $f14, 0x8($9) \n\t"
1880 "gsldrc1 $f6, -0x1($9) \n\t"
1881 "gsldrc1 $f14, 0x1($9) \n\t"
1882
1883 "pasubub $f4, $f4, $f0 \n\t"
1884 "pasubub $f6, $f6, $f2 \n\t"
1885 "biadd $f4, $f4 \n\t"
1886 "biadd $f6, $f6 \n\t"
1887 "paddh $f24, $f24, $f4 \n\t"
1888 "paddh $f26, $f26, $f6 \n\t"
1889 "pasubub $f12, $f12, $f0 \n\t"
1890 "pasubub $f14, $f14, $f2 \n\t"
1891 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1892 "biadd $f12, $f12 \n\t"
1893 "biadd $f14, $f14 \n\t"
1894 "paddh $f28, $f28, $f12 \n\t"
1895 "paddh $f30, $f30, $f14 \n\t"
1896
1897 "gsldlc1 $f12, 0x7($9) \n\t"
1898 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1899 "gsldrc1 $f12, 0x0($9) \n\t"
1900 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1901 "pasubub $f0, $f0, $f12 \n\t"
1902 "pasubub $f2, $f2, $f14 \n\t"
1903 "biadd $f0, $f0 \n\t"
1904 "biadd $f2, $f2 \n\t"
1905 "paddh $f20, $f20, $f0 \n\t"
1906 "paddh $f22, $f22, $f2 \n\t"
1907
1908 "paddh $f16, $f16, $f18 \n\t"
1909 "paddh $f20, $f20, $f22 \n\t"
1910 "paddh $f24, $f24, $f26 \n\t"
1911 "paddh $f28, $f28, $f30 \n\t"
1912 "punpcklwd $f16, $f16, $f20 \n\t"
1913 "punpcklwd $f24, $f24, $f28 \n\t"
1914 "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
1915 : [pSample1]"+&r"((unsigned char *)pSample1),
1916 [pSample2]"+&r"((unsigned char *)pSample2)
1917 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1918 [pSad]"r"((int *)pSad)
1919 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1920 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1921 );
1922 RECOVER_REG;
1923 }
1924
WelsSampleSadFour8x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1925 void WelsSampleSadFour8x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1926 int32_t iStride2, int32_t* pSad) {
1927 BACKUP_REG;
1928 __asm__ volatile (
1929 ".set arch=loongson3a \n\t"
1930 "xor $f16, $f16, $f16 \n\t"
1931 "xor $f18, $f18, $f18 \n\t"
1932 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1933 "xor $f20, $f20, $f20 \n\t"
1934 "xor $f22, $f22, $f22 \n\t"
1935 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1936 "xor $f24, $f24, $f24 \n\t"
1937 "xor $f26, $f26, $f26 \n\t"
1938 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1939 PTR_SUBU "$9, %[pSample2], %[iStride2] \n\t"
1940 "xor $f28, $f28, $f28 \n\t"
1941 "xor $f30, $f30, $f30 \n\t"
1942 "gsldlc1 $f2, 0x7($8) \n\t"
1943 "gsldlc1 $f12, 0x7($9) \n\t"
1944 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1945 "gsldrc1 $f2, 0x0($8) \n\t"
1946 "gsldrc1 $f12, 0x0($9) \n\t"
1947 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1948 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1949 "pasubub $f12, $f12, $f0 \n\t"
1950 "pasubub $f14, $f14, $f2 \n\t"
1951 "biadd $f12, $f12 \n\t"
1952 "biadd $f14, $f14 \n\t"
1953 "paddh $f16, $f16, $f12 \n\t"
1954 "paddh $f18, $f18, $f14 \n\t"
1955
1956 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
1957 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
1958 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
1959 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
1960 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
1961 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
1962
1963 "gsldlc1 $f6, 0x6($9) \n\t"
1964 "gsldlc1 $f14, 0x8($9) \n\t"
1965 "gsldrc1 $f6, -0x1($9) \n\t"
1966 "gsldrc1 $f14, 0x1($9) \n\t"
1967 "pasubub $f4, $f4, $f0 \n\t"
1968 "pasubub $f6, $f6, $f2 \n\t"
1969 "biadd $f4, $f4 \n\t"
1970 "biadd $f6, $f6 \n\t"
1971 "paddh $f24, $f24, $f4 \n\t"
1972 "paddh $f26, $f26, $f6 \n\t"
1973 "pasubub $f12, $f12, $f0 \n\t"
1974 "pasubub $f14, $f14, $f2 \n\t"
1975 "biadd $f12, $f12 \n\t"
1976 "biadd $f14, $f14 \n\t"
1977 "paddh $f28, $f28, $f12 \n\t"
1978 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
1979 "paddh $f30, $f30, $f14 \n\t"
1980
1981 "gsldlc1 $f12, 0x7($9) \n\t"
1982 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
1983 "gsldrc1 $f12, 0x0($9) \n\t"
1984 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
1985 "pasubub $f0, $f0, $f12 \n\t"
1986 "pasubub $f2, $f2, $f14 \n\t"
1987 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
1988 "biadd $f0, $f0 \n\t"
1989 "biadd $f2, $f2 \n\t"
1990 "paddh $f20, $f20, $f0 \n\t"
1991 "paddh $f22, $f22, $f2 \n\t"
1992
1993 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
1994 "gsldlc1 $f2, 0x7($8) \n\t"
1995 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
1996 "gsldrc1 $f2, 0x0($8) \n\t"
1997 "pasubub $f12, $f12, $f0 \n\t"
1998 "pasubub $f14, $f14, $f2 \n\t"
1999 "biadd $f12, $f12 \n\t"
2000 "biadd $f14, $f14 \n\t"
2001 "paddh $f16, $f16, $f12 \n\t"
2002 "paddh $f18, $f18, $f14 \n\t"
2003
2004 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
2005 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
2006 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
2007 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
2008 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
2009 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
2010
2011 "gsldlc1 $f6, 0x6($9) \n\t"
2012 "gsldlc1 $f14, 0x8($9) \n\t"
2013 "gsldrc1 $f6, -0x1($9) \n\t"
2014 "gsldrc1 $f14, 0x1($9) \n\t"
2015
2016 "pasubub $f4, $f4, $f0 \n\t"
2017 "pasubub $f6, $f6, $f2 \n\t"
2018 "biadd $f4, $f4 \n\t"
2019 "biadd $f6, $f6 \n\t"
2020 "paddh $f24, $f24, $f4 \n\t"
2021 "paddh $f26, $f26, $f6 \n\t"
2022 "pasubub $f12, $f12, $f0 \n\t"
2023 "pasubub $f14, $f14, $f2 \n\t"
2024 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
2025 "biadd $f12, $f12 \n\t"
2026 "biadd $f14, $f14 \n\t"
2027 "paddh $f28, $f28, $f12 \n\t"
2028 "paddh $f30, $f30, $f14 \n\t"
2029
2030 "gsldlc1 $f12, 0x7($9) \n\t"
2031 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
2032 "gsldrc1 $f12, 0x0($9) \n\t"
2033 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
2034 "pasubub $f0, $f0, $f12 \n\t"
2035 "pasubub $f2, $f2, $f14 \n\t"
2036 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
2037 "biadd $f0, $f0 \n\t"
2038 "biadd $f2, $f2 \n\t"
2039 "paddh $f20, $f20, $f0 \n\t"
2040 "paddh $f22, $f22, $f2 \n\t"
2041
2042 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
2043 "gsldlc1 $f2, 0x7($8) \n\t"
2044 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
2045 "gsldrc1 $f2, 0x0($8) \n\t"
2046 "pasubub $f12, $f12, $f0 \n\t"
2047 "pasubub $f14, $f14, $f2 \n\t"
2048 "biadd $f12, $f12 \n\t"
2049 "biadd $f14, $f14 \n\t"
2050 "paddh $f16, $f16, $f12 \n\t"
2051 "paddh $f18, $f18, $f14 \n\t"
2052
2053 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
2054 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
2055 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
2056 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
2057 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
2058 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
2059
2060 "gsldlc1 $f6, 0x6($9) \n\t"
2061 "gsldlc1 $f14, 0x8($9) \n\t"
2062 "gsldrc1 $f6, -0x1($9) \n\t"
2063 "gsldrc1 $f14, 0x1($9) \n\t"
2064
2065 "pasubub $f4, $f4, $f0 \n\t"
2066 "pasubub $f6, $f6, $f2 \n\t"
2067 "biadd $f4, $f4 \n\t"
2068 "biadd $f6, $f6 \n\t"
2069 "paddh $f24, $f24, $f4 \n\t"
2070 "paddh $f26, $f26, $f6 \n\t"
2071 "pasubub $f12, $f12, $f0 \n\t"
2072 "pasubub $f14, $f14, $f2 \n\t"
2073 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
2074 "biadd $f12, $f12 \n\t"
2075 "biadd $f14, $f14 \n\t"
2076 "paddh $f28, $f28, $f12 \n\t"
2077 "paddh $f30, $f30, $f14 \n\t"
2078
2079 "gsldlc1 $f12, 0x7($9) \n\t"
2080 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
2081 "gsldrc1 $f12, 0x0($9) \n\t"
2082 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
2083 "pasubub $f0, $f0, $f12 \n\t"
2084 "pasubub $f2, $f2, $f14 \n\t"
2085 PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
2086 "biadd $f0, $f0 \n\t"
2087 "biadd $f2, $f2 \n\t"
2088 "paddh $f20, $f20, $f0 \n\t"
2089 "paddh $f22, $f22, $f2 \n\t"
2090
2091 "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
2092 "gsldlc1 $f2, 0x7($8) \n\t"
2093 "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
2094 "gsldrc1 $f2, 0x0($8) \n\t"
2095 "pasubub $f12, $f12, $f0 \n\t"
2096 "pasubub $f14, $f14, $f2 \n\t"
2097 "biadd $f12, $f12 \n\t"
2098 "biadd $f14, $f14 \n\t"
2099 "paddh $f16, $f16, $f12 \n\t"
2100 "paddh $f18, $f18, $f14 \n\t"
2101
2102 "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
2103 "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
2104 PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
2105 PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
2106 "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
2107 "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
2108
2109 "gsldlc1 $f6, 0x6($9) \n\t"
2110 "gsldlc1 $f14, 0x8($9) \n\t"
2111 "gsldrc1 $f6, -0x1($9) \n\t"
2112 "gsldrc1 $f14, 0x1($9) \n\t"
2113
2114 "pasubub $f4, $f4, $f0 \n\t"
2115 "pasubub $f6, $f6, $f2 \n\t"
2116 "biadd $f4, $f4 \n\t"
2117 "biadd $f6, $f6 \n\t"
2118 "paddh $f24, $f24, $f4 \n\t"
2119 "paddh $f26, $f26, $f6 \n\t"
2120 "pasubub $f12, $f12, $f0 \n\t"
2121 "pasubub $f14, $f14, $f2 \n\t"
2122 PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
2123 "biadd $f12, $f12 \n\t"
2124 "biadd $f14, $f14 \n\t"
2125 "paddh $f28, $f28, $f12 \n\t"
2126 "paddh $f30, $f30, $f14 \n\t"
2127
2128 "gsldlc1 $f12, 0x7($9) \n\t"
2129 "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
2130 "gsldrc1 $f12, 0x0($9) \n\t"
2131 "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
2132 "pasubub $f0, $f0, $f12 \n\t"
2133 "pasubub $f2, $f2, $f14 \n\t"
2134 "biadd $f0, $f0 \n\t"
2135 "biadd $f2, $f2 \n\t"
2136 "paddh $f20, $f20, $f0 \n\t"
2137 "paddh $f22, $f22, $f2 \n\t"
2138
2139 "paddh $f16, $f16, $f18 \n\t"
2140 "paddh $f20, $f20, $f22 \n\t"
2141 "paddh $f24, $f24, $f26 \n\t"
2142 "paddh $f28, $f28, $f30 \n\t"
2143 "punpcklwd $f16, $f16, $f20 \n\t"
2144 "punpcklwd $f24, $f24, $f28 \n\t"
2145 "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
2146 : [pSample1]"+&r"((unsigned char *)pSample1),
2147 [pSample2]"+&r"((unsigned char *)pSample2)
2148 : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
2149 [pSad]"r"((int *)pSad)
2150 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
2151 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
2152 );
2153 RECOVER_REG;
2154 }
2155