/*!
 * \copy
 *     Copyright (c) 2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    vaa_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    23/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

//Reduce each of f0 and f2 to the maximum of its eight unsigned bytes; the
//result is left in the low byte. The caller must preload f4 with 0x1 (the
//pshufh control word) and f6 with 0x8 (the dsrl shift count); $f4/$f6 are
//clobbered as temporaries.
#define WELS_MAX_REG_MMI(f0, f2, f4, f6) \
  "punpckhwd  $f4, "#f0", "#f0"                       \n\t" \
  "punpckhwd  $f6, "#f2", "#f2"                       \n\t" \
  "pmaxub     "#f0", "#f0", $f4                       \n\t" \
  "pmaxub     "#f2", "#f2", $f6                       \n\t" \
  "pshufh     $f4, "#f0", "#f4"                       \n\t" \
  "pshufh     $f6, "#f2", "#f4"                       \n\t" \
  "pmaxub     "#f0", "#f0", $f4                       \n\t" \
  "pmaxub     "#f2", "#f2", $f6                       \n\t" \
  "dsrl       $f4, "#f0", "#f6"                       \n\t" \
  "dsrl       $f6, "#f2", "#f6"                       \n\t" \
  "pmaxub     "#f0", "#f0", $f4                       \n\t" \
  "pmaxub     "#f2", "#f2", $f6                       \n\t"

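//Accumulate one 16-byte row for the SAD/SD/MAD pass: the row SAD into f0/f2,
//the current-frame pixel sum into f4/f6, the reference pixel sum into f8/f10
//(the difference of the two sums gives the SD), and the byte-wise running
//max of |cur - ref| into f12/f14 (MAD). Assumes $f0/$f2 are zero; $f4-$f14
//are clobbered as scratch; r0/r1 advance by the stride in r2.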
#define WELS_SAD_SD_MAD_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                    \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")                   \n\t" \
  "pasubub    $f12, $f4, $f0                          \n\t" \
  "pasubub    $f14, $f6, $f2                          \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      "#f4", "#f4", $f12                      \n\t" \
  "paddw      "#f6", "#f6", $f14                      \n\t" \
  "pasubub    $f12, $f8, $f0                          \n\t" \
  "pasubub    $f14, $f10, $f2                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      "#f8", "#f8", $f12                      \n\t" \
  "paddw      "#f10", "#f10", $f14                    \n\t" \
  "pasubub    $f12, $f4, $f8                          \n\t" \
  "pasubub    $f14, $f6, $f10                         \n\t" \
  "pmaxub     "#f12", "#f12", $f12                    \n\t" \
  "pmaxub     "#f14", "#f14", $f14                    \n\t" \
  "pasubub    $f12, $f12, $f0                         \n\t" \
  "pasubub    $f14, $f14, $f2                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      "#f0", "#f0", $f12                      \n\t" \
  "paddw      "#f2", "#f2", $f14                      \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                     \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                     \n\t"

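//SAD of two 16-byte rows: accumulate the per-row |cur - ref| sums into f9
//(high 8-byte halves) and f10 (low halves); r1/r2 advance past both rows by
//the stride in r3. f1-f8 are clobbered as scratch.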
#define WELS_SAD_16x2_MMI(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, r1, r2, r3) \
  "gslqc1     "#f1", "#f2", 0x00("#r1")               \n\t" \
  "gslqc1     "#f3", "#f4", 0x00("#r2")               \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r3"                     \n\t" \
  "gslqc1     "#f5", "#f6", 0x00("#r1")               \n\t" \
  PTR_ADDU   ""#r2", "#r2", "#r3"                     \n\t" \
  "gslqc1     "#f7", "#f8", 0x00("#r2")               \n\t" \
  "pasubub    "#f1", "#f1", "#f3"                     \n\t" \
  "pasubub    "#f2", "#f2", "#f4"                     \n\t" \
  "biadd      "#f1", "#f1"                            \n\t" \
  "biadd      "#f2", "#f2"                            \n\t" \
  "pasubub    "#f5", "#f5", "#f7"                     \n\t" \
  "pasubub    "#f6", "#f6", "#f8"                     \n\t" \
  "biadd      "#f5", "#f5"                            \n\t" \
  "biadd      "#f6", "#f6"                            \n\t" \
  "paddw      "#f9", "#f9", "#f1"                     \n\t" \
  "paddw      "#f9", "#f9", "#f5"                     \n\t" \
  "paddw      "#f10", "#f10", "#f2"                   \n\t" \
  "paddw      "#f10", "#f10", "#f6"                   \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r3"                     \n\t" \
  PTR_ADDU   ""#r2", "#r2", "#r3"                     \n\t"

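//One 16-byte row of the SAD/sum/sqsum/sqdiff pass, with fixed accumulators:
//SAD into $f28/$f30, current-frame pixel sum into $f24/$f26, sum of squared
//pixels into $f20/$f22, and sum of squared differences into $f16/$f18.
//Assumes $f0/$f2 are zero; r0/r1 advance by the stride in r2.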
#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                    \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")                   \n\t" \
  "pasubub    $f12, $f4, $f8                          \n\t" \
  "pasubub    $f14, $f6, $f10                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      $f28, $f28, $f12                        \n\t" \
  "paddw      $f30, $f30, $f14                        \n\t" \
  "pasubub    $f12, $f4, $f8                          \n\t" \
  "pasubub    $f14, $f6, $f10                         \n\t" \
  "pasubub    $f8, $f4, $f0                           \n\t" \
  "pasubub    $f10, $f6, $f2                          \n\t" \
  "biadd      $f8, $f8                                \n\t" \
  "biadd      $f10, $f10                              \n\t" \
  "paddw      $f24, $f24, $f8                         \n\t" \
  "paddw      $f26, $f26, $f10                        \n\t" \
  "punpcklbh  $f8, $f6, $f2                           \n\t" \
  "punpckhbh  $f10, $f6, $f2                          \n\t" \
  "punpckhbh  $f6, $f4, $f0                           \n\t" \
  "punpcklbh  $f4, $f4, $f0                           \n\t" \
  "pmaddhw    $f4, $f4, $f4                           \n\t" \
  "pmaddhw    $f6, $f6, $f6                           \n\t" \
  "pmaddhw    $f8, $f8, $f8                           \n\t" \
  "pmaddhw    $f10, $f10, $f10                        \n\t" \
  "paddw      $f20, $f20, $f4                         \n\t" \
  "paddw      $f22, $f22, $f6                         \n\t" \
  "paddw      $f20, $f20, $f8                         \n\t" \
  "paddw      $f22, $f22, $f10                        \n\t" \
  "punpcklbh  $f4, $f12, $f0                          \n\t" \
  "punpckhbh  $f6, $f12, $f0                          \n\t" \
  "punpcklbh  $f12, $f14, $f2                         \n\t" \
  "punpckhbh  $f14, $f14, $f2                         \n\t" \
  "pmaddhw    $f4, $f4, $f4                           \n\t" \
  "pmaddhw    $f6, $f6, $f6                           \n\t" \
  "pmaddhw    $f12, $f12, $f12                        \n\t" \
  "pmaddhw    $f14, $f14, $f14                        \n\t" \
  "paddw      $f16, $f16, $f4                         \n\t" \
  "paddw      $f18, $f18, $f6                         \n\t" \
  "paddw      $f16, $f16, $f12                        \n\t" \
  "paddw      $f18, $f18, $f14                        \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                     \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                     \n\t"

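//One 16-byte row of the combined SAD/BGD/sqdiff pass, with packed
//accumulators: f0/f2 hold the SAD in their low words and the squared-pixel
//sum in their high words, f4/f6 hold the current-frame sum (low word) and
//the reference sum (high word; their difference gives the SD), f8/f10 hold
//the byte-wise running max of |cur - ref| (MAD), and f12/f14 accumulate the
//squared differences. Assumes $f0/$f2 are zero; r0/r1 advance by the stride
//in r2.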
#define WELS_SAD_BGD_SQDIFF_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                    \n\t" \
  "punpcklbh  $f8, $f4, $f0                           \n\t" \
  "punpckhbh  $f10, $f4, $f0                          \n\t" \
  "punpcklbh  $f12, $f6, $f2                          \n\t" \
  "punpckhbh  $f14, $f6, $f2                          \n\t" \
  "pmaddhw    $f8, $f8, $f8                           \n\t" \
  "pmaddhw    $f10, $f10, $f10                        \n\t" \
  "pmaddhw    $f12, $f12, $f12                        \n\t" \
  "pmaddhw    $f14, $f14, $f14                        \n\t" \
  "paddw      $f8, $f8, $f12                          \n\t" \
  "paddw      $f10, $f10, $f14                        \n\t" \
  "punpckhwd  $f12, $f0, $f8                          \n\t" \
  "punpckhwd  $f14, $f0, $f10                         \n\t" \
  "punpcklwd  $f8, $f0, $f8                           \n\t" \
  "punpcklwd  $f10, $f0, $f10                         \n\t" \
  "paddw      $f8, $f8, $f12                          \n\t" \
  "paddw      $f10, $f10, $f14                        \n\t" \
  "paddw      "#f0", "#f0", $f8                       \n\t" \
  "paddw      "#f2", "#f2", $f10                      \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")                   \n\t" \
  "pasubub    $f12, $f4, $f0                          \n\t" \
  "pasubub    $f14, $f6, $f2                          \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      "#f4", "#f4", $f12                      \n\t" \
  "paddw      "#f6", "#f6", $f14                      \n\t" \
  "pasubub    $f12, $f8, $f0                          \n\t" \
  "pasubub    $f14, $f10, $f2                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "punpcklwd  $f14, $f14, $f14                        \n\t" \
  "punpckhwd  $f14, $f12, $f14                        \n\t" \
  "punpcklwd  $f12, $f0, $f12                         \n\t" \
  "paddw      "#f4", "#f4", $f12                      \n\t" \
  "paddw      "#f6", "#f6", $f14                      \n\t" \
  "pasubub    $f12, $f4, $f8                          \n\t" \
  "pasubub    $f14, $f6, $f10                         \n\t" \
  "pmaxub     "#f8", "#f8", $f12                      \n\t" \
  "pmaxub     "#f10", "#f10", $f14                    \n\t" \
  "paddw      $f4, $f0, $f12                          \n\t" \
  "paddw      $f6, $f0, $f14                          \n\t" \
  "pasubub    $f12, $f12, $f0                         \n\t" \
  "pasubub    $f14, $f14, $f2                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      "#f0", "#f0", $f12                      \n\t" \
  "paddw      "#f2", "#f2", $f14                      \n\t" \
  "paddw      $f12, $f0, $f4                          \n\t" \
  "paddw      $f14, $f0, $f6                          \n\t" \
  "punpcklbh  $f4, $f12, $f0                          \n\t" \
  "punpckhbh  $f6, $f12, $f0                          \n\t" \
  "punpcklbh  $f12, $f14, $f2                         \n\t" \
  "punpckhbh  $f14, $f14, $f2                         \n\t" \
  "pmaddhw    $f4, $f4, $f4                           \n\t" \
  "pmaddhw    $f6, $f6, $f6                           \n\t" \
  "pmaddhw    $f12, $f12, $f12                        \n\t" \
  "pmaddhw    $f14, $f14, $f14                        \n\t" \
  "paddw      "#f12", "#f12", $f4                     \n\t" \
  "paddw      "#f14", "#f14", $f6                     \n\t" \
  "paddw      "#f12", "#f12", $f12                    \n\t" \
  "paddw      "#f14", "#f14", $f14                    \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                     \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                     \n\t"

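//One 16-byte row of the SAD/sum/sqsum pass, with fixed accumulators: SAD
//into $f24/$f26, current-frame pixel sum into $f20/$f22, and sum of squared
//pixels into $f16/$f18. Assumes $f0/$f2 are zero; r0/r1 advance by the
//stride in r2.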
#define WELS_SAD_SUM_SQSUM_16x1_MMI(r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                    \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")                   \n\t" \
  "pasubub    $f12, $f4, $f8                          \n\t" \
  "pasubub    $f14, $f6, $f10                         \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      $f24, $f24, $f12                        \n\t" \
  "paddw      $f26, $f26, $f14                        \n\t" \
  "pasubub    $f12, $f4, $f0                          \n\t" \
  "pasubub    $f14, $f6, $f2                          \n\t" \
  "biadd      $f12, $f12                              \n\t" \
  "biadd      $f14, $f14                              \n\t" \
  "paddw      $f20, $f20, $f12                        \n\t" \
  "paddw      $f22, $f22, $f14                        \n\t" \
  "punpcklbh  $f8, $f6, $f2                           \n\t" \
  "punpckhbh  $f10, $f6, $f2                          \n\t" \
  "punpckhbh  $f6, $f4, $f0                           \n\t" \
  "punpcklbh  $f4, $f4, $f0                           \n\t" \
  "pmaddhw    $f4, $f4, $f4                           \n\t" \
  "pmaddhw    $f6, $f6, $f6                           \n\t" \
  "pmaddhw    $f8, $f8, $f8                           \n\t" \
  "pmaddhw    $f10, $f10, $f10                        \n\t" \
  "paddw      $f16, $f16, $f4                         \n\t" \
  "paddw      $f18, $f18, $f6                         \n\t" \
  "paddw      $f16, $f16, $f8                         \n\t" \
  "paddw      $f18, $f18, $f10                        \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                     \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                     \n\t"

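//Compute the SAD between the current and reference pictures: one SAD per
//8x8 block into pSad8x8 (four entries per 16x16 macroblock) and the
//whole-frame SAD into pFrameSad. The picture is processed in whole 16x16
//macroblocks (iPicWidth and iPicHeight are in pixels).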
void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
                    int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                    int32_t* pFrameSad, int32_t* pSad8x8) {
  double ftmp[13];
  uint64_t tmp[2];
  mips_reg addr[3];

  __asm__ volatile (
    ".set arch=loongson3a                               \n\t"
    PTR_SRL    "%[iPicWidth], %[iPicWidth], 0x04        \n\t"
    PTR_SRL    "%[iPicHeight], %[iPicHeight], 0x04      \n\t"
    "move       %[addr2], %[iPicStride]                 \n\t"
    PTR_SLL    "%[iPicStride], %[iPicStride], 0x04      \n\t"
    "xor        %[ftmp0], %[ftmp0], %[ftmp0]            \n\t"
    "xor        %[ftmp11], %[ftmp11], %[ftmp11]         \n\t"
    "xor        %[ftmp12], %[ftmp12], %[ftmp12]         \n\t"
    "1:                                                 \n\t"
    "move       %[addr0], %[pCurData]                   \n\t"
    "move       %[addr1], %[pRefData]                   \n\t"
    "move       %[tmp0], %[iPicWidth]                   \n\t"
    "2:                                                 \n\t"
    "xor        %[ftmp9], %[ftmp9], %[ftmp9]            \n\t"
    "xor        %[ftmp10], %[ftmp10], %[ftmp10]         \n\t"
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    "paddw      %[ftmp11], %[ftmp11], %[ftmp9]          \n\t"
    "paddw      %[ftmp12], %[ftmp12], %[ftmp10]         \n\t"
    "swc1       %[ftmp10], 0x00(%[pSad8x8])             \n\t"
    "swc1       %[ftmp9], 0x04(%[pSad8x8])              \n\t"

    "xor        %[ftmp9], %[ftmp9], %[ftmp9]            \n\t"
    "xor        %[ftmp10], %[ftmp10], %[ftmp10]         \n\t"
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    "paddw      %[ftmp11], %[ftmp11], %[ftmp9]          \n\t"
    "paddw      %[ftmp12], %[ftmp12], %[ftmp10]         \n\t"
    "swc1       %[ftmp10], 0x08(%[pSad8x8])             \n\t"
    "swc1       %[ftmp9], 0x0c(%[pSad8x8])              \n\t"

    PTR_ADDU   "%[pSad8x8], %[pSad8x8], 0x10            \n\t"
    PTR_SUBU   "%[addr0], %[addr0], %[iPicStride]       \n\t"
    PTR_SUBU   "%[addr1], %[addr1], %[iPicStride]       \n\t"
    PTR_ADDI   "%[tmp0], %[tmp0], -0x01                 \n\t"
    PTR_ADDU   "%[addr0], %[addr0], 0x10                \n\t"
    PTR_ADDU   "%[addr1], %[addr1], 0x10                \n\t"
    "bnez       %[tmp0], 2b                             \n\t"

    PTR_ADDI   "%[iPicHeight], %[iPicHeight], -0x01     \n\t"
    PTR_ADDU   "%[pCurData], %[pCurData], %[iPicStride] \n\t"
    PTR_ADDU   "%[pRefData], %[pRefData], %[iPicStride] \n\t"
    "bnez       %[iPicHeight], 1b                       \n\t"

    "paddw      %[ftmp11], %[ftmp11], %[ftmp12]         \n\t"
    "swc1       %[ftmp11], 0x00(%[pFrameSad])           \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
      [pCurData]"+&r"(pCurData), [pRefData]"+&r"(pRefData),
      [iPicHeight]"+&r"(iPicHeight), [iPicWidth]"+&r"(iPicWidth),
      [pSad8x8]"+&r"(pSad8x8), [iPicStride]"+&r"(iPicStride),
      [addr2]"=&r"(addr[2])
    : [pFrameSad]"r"(pFrameSad)
    : "memory"
  );
}

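//As VAACalcSad_mmi, and additionally output per-8x8 background-detection
//(BGD) statistics: the signed sum of differences sum(cur) - sum(ref) into
//p_sd8x8 and the max absolute pixel difference into p_mad8x8.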
void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
                       uint8_t *p_mad8x8) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a                               \n\t"
    "move       $15, %[cur_data]                        \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4         \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4       \n\t"
    "dsll       $13, %[iPicStride], 0x4                 \n\t"
    "xor        $f0, $f0, $f0                           \n\t"
    "xor        $f2, $f2, $f2                           \n\t"
    "xor        $14, $14, $14                           \n\t"
    "1:                                                 \n\t"
    "move       $9, %[iPicWidth]                        \n\t"
    "move       $10, $15                                \n\t"
    "move       $11, %[ref_data]                        \n\t"
    "2:                                                 \n\t"
    "xor        $f28, $f28, $f28                        \n\t"
    "xor        $f30, $f30, $f30                        \n\t"
    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    "xor        $f20, $f20, $f20                        \n\t"
    "xor        $f22, $f22, $f22                        \n\t"
    "xor        $f16, $f16, $f16                        \n\t"
    "xor        $f18, $f18, $f18                        \n\t"
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])

    "dli        $8, 0x1                                 \n\t"
    "dmtc1      $8, $f8                                 \n\t"
    "dli        $8, 0x8                                 \n\t"
    "dmtc1      $8, $f10                                \n\t"
    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)

    "dmfc1      $8, $f16                                \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                    \n\t"
    "dmfc1      $8, $f18                                \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                    \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2           \n\t"

    "xor        $f16, $f16, $f16                        \n\t"
    "xor        $f18, $f18, $f18                        \n\t"
    "punpcklwd  $f30, $f30, $f30                        \n\t"
    "punpcklwd  $f26, $f26, $f26                        \n\t"
    "punpcklwd  $f22, $f22, $f22                        \n\t"

    "punpckhwd  $f30, $f28, $f30                        \n\t"
    "punpckhwd  $f26, $f24, $f26                        \n\t"
    "punpckhwd  $f22, $f20, $f22                        \n\t"

    "punpcklwd  $f28, $f16, $f28                        \n\t"
    "punpcklwd  $f24, $f16, $f24                        \n\t"
    "punpcklwd  $f20, $f16, $f20                        \n\t"

    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])

    "dli        $8, 0x1                                 \n\t"
    "dmtc1      $8, $f8                                 \n\t"
    "dli        $8, 0x8                                 \n\t"
    "dmtc1      $8, $f10                                \n\t"
    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)

    "dmfc1      $8, $f16                                \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                    \n\t"
    "dmfc1      $8, $f18                                \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                    \n\t"
    "punpckhwd  $f4, $f28, $f30                         \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2           \n\t"

    "punpcklwd  $f6, $f28, $f30                         \n\t"
    "gssqc1     $f6, $f4, 0x0(%[psad8x8])               \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10            \n\t"

    "paddw      $f6, $f6, $f30                          \n\t"
    "paddw      $f4, $f4, $f28                          \n\t"
    "punpckhwd  $f8, $f6, $f6                           \n\t"
    "paddw      $f4, $f4, $f8                           \n\t"
    "dmtc1      $14, $f6                                \n\t"
    "paddw      $f6, $f6, $f4                           \n\t"
    "dmfc1      $14, $f6                                \n\t"

    "psubw      $f24, $f24, $f20                        \n\t"
    "psubw      $f26, $f26, $f22                        \n\t"
    "punpckhwd  $f4, $f24, $f26                         \n\t"
    "punpcklwd  $f6, $f24, $f26                         \n\t"
    "gssqc1     $f6, $f4, 0x0(%[p_sd8x8])               \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x10            \n\t"

    PTR_SUBU   "$15, $15, $13                           \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13           \n\t"
    PTR_ADDIU  "$15, $15, 0x10                          \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10          \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1        \n\t"
    "bnez       %[iPicWidth], 2b                        \n\t"
    "move       %[iPicWidth], $9                        \n\t"
    "move       $15, $10                                \n\t"
    "move       %[ref_data], $11                        \n\t"
    PTR_ADDU   "$15, $15, $13                           \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13           \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1      \n\t"
    "bnez       %[iPicHeight], 1b                       \n\t"

    "swl        $14, 0x3(%[psadframe])                  \n\t"
    "swr        $14, 0x0(%[psadframe])                  \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

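//As VAACalcSad_mmi, and additionally output per-16x16 statistics: the pixel
//sum into psum16x16, the sum of squared pixels into psqsum16x16, and the
//sum of squared pixel differences into psqdiff16x16.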
void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                       int32_t *psqsum16x16, int32_t *psqdiff16x16) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a                               \n\t"
    "move       $15, %[cur_data]                        \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4         \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4       \n\t"
    "dsll       $13, %[iPicStride], 0x4                 \n\t"
    "xor        $f0, $f0, $f0                           \n\t"
    "xor        $f2, $f2, $f2                           \n\t"
    "xor        $12, $12, $12                           \n\t"
    "xor        $14, $14, $14                           \n\t"
    "1:                                                 \n\t"
    "move       $9, %[iPicWidth]                        \n\t"
    "move       $10, $15                                \n\t"
    "move       $11, %[ref_data]                        \n\t"
    "2:                                                 \n\t"
    "xor        $f28, $f28, $f28                        \n\t"
    "xor        $f30, $f30, $f30                        \n\t"
    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    "xor        $f20, $f20, $f20                        \n\t"
    "xor        $f22, $f22, $f22                        \n\t"
    "xor        $f16, $f16, $f16                        \n\t"
    "xor        $f18, $f18, $f18                        \n\t"
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    "dmfc1      $8, $f28                                \n\t"
    "sw         $8, 0x0(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f30                                \n\t"
    "sw         $8, 0x4(%[psad8x8])                     \n\t"
    "paddw      $f4, $f28, $f30                         \n\t"
    "dmfc1      $12, $f4                                \n\t"
    PTR_ADDU   "$14, $14, $12                           \n\t"

    "xor        $f28, $f28, $f28                        \n\t"
    "xor        $f30, $f30, $f30                        \n\t"
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    "dmfc1      $8, $f28                                \n\t"
    "sw         $8, 0x8(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f30                                \n\t"
    "paddw      $f4, $f28, $f30                         \n\t"
    "sw         $8, 0xc(%[psad8x8])                     \n\t"
    "dmfc1      $12, $f4                                \n\t"
    PTR_ADDU   "$14, $14, $12                           \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10            \n\t"

    "paddw      $f24, $f24, $f26                        \n\t"
    "dmfc1      $8, $f24                                \n\t"
    "sw         $8, 0x0(%[psum16x16])                   \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4         \n\t"
    "paddw      $f24, $f20, $f22                        \n\t"
    "punpcklwd  $f20, $f24, $f24                        \n\t"
    "punpckhwd  $f22, $f24, $f24                        \n\t"
    "paddw      $f20, $f20, $f22                        \n\t"
    "dmfc1      $8, $f20                                \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                 \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4     \n\t"

    "paddw      $f20, $f16, $f18                        \n\t"
    "punpcklwd  $f16, $f20, $f20                        \n\t"
    "punpckhwd  $f18, $f20, $f20                        \n\t"
    "paddw      $f16, $f16, $f18                        \n\t"
    "dmfc1      $8, $f16                                \n\t"
    "sw         $8, 0x0(%[psqdiff16x16])                \n\t"
    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4   \n\t"

    PTR_SUBU   "$15, $15, $13                           \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13           \n\t"
    PTR_ADDIU  "$15, $15, 0x10                          \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10          \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1        \n\t"
    "bnez       %[iPicWidth], 2b                        \n\t"
    "nop                                                \n\t"
    "move       %[iPicWidth], $9                        \n\t"
    "move       $15, $10                                \n\t"
    "move       %[ref_data], $11                        \n\t"
    PTR_ADDU   "$15, $15, $13                           \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13           \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1      \n\t"
    "bnez       %[iPicHeight], 1b                       \n\t"
    "nop                                                \n\t"

    "sw         $14, 0x0(%[psadframe])                  \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
      [psqsum16x16]"+&r"((int *)psqsum16x16), [psqdiff16x16]"+&r"((int *)psqdiff16x16)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

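//Combined pass: per-8x8 SAD, SD and MAD (as in VAACalcSadBgd_mmi) plus the
//per-16x16 sum, squared sum and squared-difference sum (as in
//VAACalcSadSsd_mmi).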
void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                          int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                          int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                          int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
                          uint8_t *p_mad8x8) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a                               \n\t"
    "move       $15, %[cur_data]                        \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4         \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4       \n\t"
    "dsll       $13, %[iPicStride], 0x4                 \n\t"
    "xor        $f0, $f0, $f0                           \n\t"
    "xor        $f2, $f2, $f2                           \n\t"
    "xor        $12, $12, $12                           \n\t"
    "xor        $14, $14, $14                           \n\t"
    "1:                                                 \n\t"
    "move       $9, %[iPicWidth]                        \n\t"
    "move       $10, $15                                \n\t"
    "move       $11, %[ref_data]                        \n\t"
    "2:                                                 \n\t"
    "xor        $f28, $f28, $f28                        \n\t"
    "xor        $f30, $f30, $f30                        \n\t"
    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    "xor        $f20, $f20, $f20                        \n\t"
    "xor        $f22, $f22, $f22                        \n\t"
    "xor        $f16, $f16, $f16                        \n\t"
    "xor        $f18, $f18, $f18                        \n\t"
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])

    "dmfc1      $8, $f28                                \n\t"
    "sw         $8, 0x0(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f30                                \n\t"
    "sw         $8, 0x4(%[psad8x8])                     \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8             \n\t"

    "paddw      $f4, $f28, $f30                         \n\t"
    "dmfc1      $12, $f4                                \n\t"
    PTR_ADDU   "$14, $14, $12                           \n\t"

    "paddw      $f4, $f24, $f26                         \n\t"
    "dmfc1      $8, $f4                                 \n\t"
    "sw         $8, 0x0(%[psum16x16])                   \n\t"

    "punpckhwd  $f4, $f24, $f26                         \n\t"
    "punpcklwd  $f6, $f24, $f26                         \n\t"
    "psubw      $f6, $f6, $f4                           \n\t"
    "dmfc1      $8, $f6                                 \n\t"
    PTR_S      "$8, 0x0(%[p_sd8x8])                     \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8             \n\t"

    "dli        $8, 0x1                                 \n\t"
    "dmtc1      $8, $f8                                 \n\t"
    "dli        $8, 0x8                                 \n\t"
    "dmtc1      $8, $f10                                \n\t"
    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)

    "dmfc1      $8, $f20                                \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                    \n\t"
    "dmfc1      $8, $f22                                \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                    \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2           \n\t"

    "xor        $f20, $f20, $f20                        \n\t"
    "xor        $f22, $f22, $f22                        \n\t"
    "punpckhwd  $f28, $f20, $f28                        \n\t"
    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    "punpckhwd  $f30, $f20, $f30                        \n\t"
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])

    "dmfc1      $8, $f28                                \n\t"
    "sw         $8, 0x0(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f30                                \n\t"
    "sw         $8, 0x4(%[psad8x8])                     \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8             \n\t"

    "paddw      $f4, $f28, $f30                         \n\t"
    "dmfc1      $12, $f4                                \n\t"
    PTR_ADDU   "$14, $14, $12                           \n\t"

    "paddw      $f4, $f24, $f26                         \n\t"
    "dmfc1      $8, $f4                                 \n\t"
    "lw         $12, 0x0(%[psum16x16])                  \n\t"
    PTR_ADDU   "$8, $8, $12                             \n\t"
    "sw         $8, 0x0(%[psum16x16])                   \n\t"
    "xor        $f8, $f8, $f8                           \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4         \n\t"

    "punpckhwd  $f30, $f30, $f8                         \n\t"
    "punpckhwd  $f28, $f28, $f8                         \n\t"
    "paddw      $f8, $f28, $f30                         \n\t"
    "dmfc1      $8, $f8                                 \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                 \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4     \n\t"

    "punpckhwd  $f4, $f24, $f26                         \n\t"
    "punpcklwd  $f6, $f24, $f26                         \n\t"
    "psubw      $f6, $f6, $f4                           \n\t"
    "dmfc1      $8, $f6                                 \n\t"
    PTR_S      "$8, 0x0(%[p_sd8x8])                     \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8             \n\t"

    "dli        $8, 0x1                                 \n\t"
    "dmtc1      $8, $f8                                 \n\t"
    "dli        $8, 0x8                                 \n\t"
    "dmtc1      $8, $f10                                \n\t"
    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)

    "dmfc1      $8, $f20                                \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                    \n\t"
    "dmfc1      $8, $f22                                \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                    \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2           \n\t"

    "paddw      $f20, $f16, $f18                        \n\t"
    "punpcklwd  $f16, $f20, $f20                        \n\t"
    "punpckhwd  $f18, $f20, $f20                        \n\t"
    "paddw      $f16, $f16, $f18                        \n\t"
    "dmfc1      $8, $f16                                \n\t"
    "sw         $8, 0x0(%[psqdiff16x16])                \n\t"
    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4   \n\t"

    PTR_SUBU   "$15, $15, $13                           \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13           \n\t"
    PTR_ADDIU  "$15, $15, 0x10                          \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10          \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1        \n\t"
    "bnez       %[iPicWidth], 2b                        \n\t"
    "nop                                                \n\t"
    "move       %[iPicWidth], $9                        \n\t"
    "move       $15, $10                                \n\t"
    "move       %[ref_data], $11                        \n\t"
    PTR_ADDU   "$15, $15, $13                           \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13           \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1      \n\t"
    "bnez       %[iPicHeight], 1b                       \n\t"
    "nop                                                \n\t"

    "sw         $14, 0x0(%[psadframe])                  \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
      [psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
      [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

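//As VAACalcSad_mmi, and additionally output the per-16x16 pixel sum
//(psum16x16) and squared-pixel sum (psqsum16x16) needed for variance
//computation.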
void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                       int32_t *psqsum16x16) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a                               \n\t"
    "move       $15, %[cur_data]                        \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4         \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4       \n\t"
    "dsll       $13, %[iPicStride], 0x4                 \n\t"
    "xor        $f0, $f0, $f0                           \n\t"
    "xor        $f2, $f2, $f2                           \n\t"
    "xor        $f28, $f28, $f28                        \n\t"
    "xor        $f30, $f30, $f30                        \n\t"
    "xor        $14, $14, $14                           \n\t"
    "1:                                                 \n\t"
    "move       $9, %[iPicWidth]                        \n\t"
    "move       $10, $15                                \n\t"
    "move       $11, %[ref_data]                        \n\t"
    "2:                                                 \n\t"
    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    "xor        $f20, $f20, $f20                        \n\t"
    "xor        $f22, $f22, $f22                        \n\t"
    "xor        $f16, $f16, $f16                        \n\t"
    "xor        $f18, $f18, $f18                        \n\t"
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    "paddw      $f28, $f24, $f28                        \n\t"
    "paddw      $f30, $f26, $f30                        \n\t"
    "dmfc1      $8, $f24                                \n\t"
    "sw         $8, 0x0(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f26                                \n\t"
    "sw         $8, 0x4(%[psad8x8])                     \n\t"

    "xor        $f24, $f24, $f24                        \n\t"
    "xor        $f26, $f26, $f26                        \n\t"
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    "paddw      $f28, $f24, $f28                        \n\t"
    "paddw      $f30, $f26, $f30                        \n\t"
    "dmfc1      $8, $f24                                \n\t"
    "sw         $8, 0x8(%[psad8x8])                     \n\t"
    "dmfc1      $8, $f26                                \n\t"
    "sw         $8, 0xc(%[psad8x8])                     \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10            \n\t"

    "paddw      $f20, $f20, $f22                        \n\t"
    "dmfc1      $8, $f20                                \n\t"
    "sw         $8, 0x0(%[psum16x16])                   \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4         \n\t"

    "paddw      $f20, $f16, $f18                        \n\t"
    "punpcklwd  $f16, $f20, $f20                        \n\t"
    "punpckhwd  $f18, $f20, $f20                        \n\t"
    "paddw      $f16, $f16, $f18                        \n\t"
    "dmfc1      $8, $f16                                \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                 \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4     \n\t"

    PTR_SUBU   "$15, $15, $13                           \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13           \n\t"
    PTR_ADDIU  "$15, $15, 0x10                          \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10          \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1        \n\t"
    "bnez       %[iPicWidth], 2b                        \n\t"
    "nop                                                \n\t"
    "move       %[iPicWidth], $9                        \n\t"
    "move       $15, $10                                \n\t"
    "move       %[ref_data], $11                        \n\t"
    PTR_ADDU   "$15, $15, $13                           \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13           \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1      \n\t"
    "bnez       %[iPicHeight], 1b                       \n\t"
    "nop                                                \n\t"

    "paddw      $f28, $f28, $f30                        \n\t"
    "dmfc1      $8, $f28                                \n\t"
    "sw         $8, 0x0(%[psadframe])                   \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
      [psqsum16x16]"+&r"((int *)psqsum16x16)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}