/*!
 * \copy
 * Copyright (c) 2009-2018, Cisco Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file dct_mmi.c
 *
 * \brief Loongson MMI optimization: 4x4 forward/inverse integer transforms and the luma DC Hadamard transform
 *
 * \date 20/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

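// Load a 4x8 block of 16-bit coefficients (64 bytes at r0) into four register
// pairs, swapping doublewords so that two 4x4 sub-blocks can be transformed
// side by side.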
#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  "gslqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
  "gslqc1 "#f10", "#f8", 0x10("#r0") \n\t" \
  "gslqc1 "#f18", "#f16", 0x20("#r0") \n\t" \
  "gslqc1 "#f6", "#f4", 0x30("#r0") \n\t" \
  MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14) \
  MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)

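// Inverse-transform butterfly on 16-bit lanes (shift amount in f16, set to 1
// by callers): {f0,f2} = a + (b >> 1), {f12,f14} = (a >> 1) - b.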
#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) \
  "mov.d "#f8", "#f4" \n\t" \
  "mov.d "#f10", "#f6" \n\t" \
  "psrah "#f4", "#f4", "#f16" \n\t" \
  "psrah "#f6", "#f6", "#f16" \n\t" \
  "psrah "#f12", "#f0", "#f16" \n\t" \
  "psrah "#f14", "#f2", "#f16" \n\t" \
  "paddh "#f0", "#f0", "#f4" \n\t" \
  "paddh "#f2", "#f2", "#f6" \n\t" \
  "psubh "#f12", "#f12", "#f8" \n\t" \
  "psubh "#f14", "#f14", "#f10" \n\t"

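// One pass of the 4-point inverse integer transform applied to two 4x4 blocks
// held in register pairs; built from MMI_SumSub and MMI_SumSubDiv2, with the
// >>1 shift amount passed in f28.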
#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) \
  MMI_SumSub(f24, f26, f4, f6, f20, f22) \
  MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28) \
  MMI_SumSub(f4, f6, f0, f2, f16, f18) \
  MMI_SumSub(f24, f26, f12, f14, f16, f18)

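// Reconstruction store: rounds the residual in {f0,f2} ((x + f8) >> f14,
// callers pass 32 and 6), adds it with saturation to 8 prediction bytes
// loaded from r1, clamps to [0,255] and stores the 8 pixels to r0
// (f12 must be zero).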
#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) \
  "paddh "#f0", "#f0", "#f8" \n\t" \
  "paddh "#f2", "#f2", "#f8" \n\t" \
  "psrah "#f0", "#f0", "#f14" \n\t" \
  "psrah "#f2", "#f2", "#f14" \n\t" \
  "gsldlc1 "#f4", 0x7("#r1") \n\t" \
  "gsldrc1 "#f4", 0x0("#r1") \n\t" \
  "punpckhbh "#f6", "#f4", "#f12" \n\t" \
  "punpcklbh "#f4", "#f4", "#f12" \n\t" \
  "paddsh "#f4", "#f4", "#f0" \n\t" \
  "paddsh "#f6", "#f6", "#f2" \n\t" \
  "packushb "#f4", "#f4", "#f6" \n\t" \
  "gssdlc1 "#f4", 0x7("#r0") \n\t" \
  "gssdrc1 "#f4", 0x0("#r0") \n\t"

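// Same as MMI_StoreDiff8p_6 but without the rounding/shift step: adds the
// 16-bit residual in {f0,f2} to 8 prediction bytes at r1+offset and stores
// the clamped result to r0+offset (f8 must be zero).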
#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) \
  "gsldlc1 "#f4", "#offset"+0x7("#r1") \n\t" \
  "gsldrc1 "#f4", "#offset"+0x0("#r1") \n\t" \
  "punpckhbh "#f6", "#f4", "#f8" \n\t" \
  "punpcklbh "#f4", "#f4", "#f8" \n\t" \
  "paddsh "#f4", "#f4", "#f0" \n\t" \
  "paddsh "#f6", "#f6", "#f2" \n\t" \
  "packushb "#f4", "#f4", "#f6" \n\t" \
  "gssdlc1 "#f4", "#offset"+0x7("#r0") \n\t" \
  "gssdrc1 "#f4", "#offset"+0x0("#r0") \n\t"

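// Load eight DC coefficients from r0+offset, round them ((dc + f16) >> f20,
// callers pass 32 and 6) and broadcast each value across the four halfwords
// of one result register (f0..f14).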
#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) \
  "gslqc1 "#f2", "#f0", "#offset"+0x0("#r0") \n\t" \
  "paddh "#f0", "#f0", "#f16" \n\t" \
  "paddh "#f2", "#f2", "#f16" \n\t" \
  "psrah "#f0", "#f0", "#f20" \n\t" \
  "psrah "#f2", "#f2", "#f20" \n\t" \
  "punpckhhw "#f4", "#f0", "#f0" \n\t" \
  "punpckhwd "#f6", "#f4", "#f4" \n\t" \
  "punpcklwd "#f4", "#f4", "#f4" \n\t" \
  "punpcklhw "#f8", "#f2", "#f2" \n\t" \
  "punpckhwd "#f10", "#f8", "#f8" \n\t" \
  "punpcklwd "#f8", "#f8", "#f8" \n\t" \
  "punpckhhw "#f12", "#f2", "#f2" \n\t" \
  "punpckhwd "#f14", "#f12", "#f12" \n\t" \
  "punpcklwd "#f12", "#f12", "#f12" \n\t" \
  "punpcklhw "#f0", "#f0", "#f0" \n\t" \
  "punpckhwd "#f2", "#f0", "#f0" \n\t" \
  "punpcklwd "#f0", "#f0", "#f0" \n\t"

#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) \
  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) \
  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8) \
  PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
  PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) \
  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)

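// Gather four 16-bit DC coefficients (at byte offsets +0x00, +0x20, +0x80 and
// +0xa0 from r0) as sign-extended 32-bit words into {f0,f2}; used to collect
// the luma DC terms for the Hadamard transform.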
#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) \
  "lh $8, "#offset"("#r0") \n\t" \
  "dmtc1 $8, "#f0" \n\t" \
  "lh $8, "#offset"+0x20("#r0") \n\t" \
  "dmtc1 $8, "#f4" \n\t" \
  "punpcklwd "#f0", "#f0", "#f4" \n\t" \
  "lh $8, "#offset"+0x80("#r0") \n\t" \
  "dmtc1 $8, "#f6" \n\t" \
  "lh $8, "#offset"+0xa0("#r0") \n\t" \
  "dmtc1 $8, "#f8" \n\t" \
  "punpcklwd "#f2", "#f6", "#f8" \n\t"

#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) \
  "mov.d "#f8", "#f4" \n\t" \
  "mov.d "#f10", "#f6" \n\t" \
  "paddw "#f4", "#f4", "#f0" \n\t" \
  "paddw "#f6", "#f6", "#f2" \n\t" \
  "psubw "#f0", "#f0", "#f8" \n\t" \
  "psubw "#f2", "#f2", "#f10" \n\t"

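// Build the constant 1 in every 32-bit lane of {f0,f2} (all-ones from pcmpeqh,
// then a logical right shift by 31); used as the +1 rounding term below.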
#define WELS_DD1(f0, f2, f_val_31) \
  "pcmpeqh "#f0", "#f0", "#f0" \n\t" \
  "pcmpeqh "#f2", "#f2", "#f2" \n\t" \
  "psrlw "#f0", "#f0", "#f_val_31" \n\t" \
  "psrlw "#f2", "#f2", "#f_val_31" \n\t"

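// Rounded average/difference on 32-bit lanes: {f0,f2} = (a + b + 1) >> 1 and
// {f12,f14} = ((a + b + 1) >> 1) - b, with the +1 vector in {f8,f10} and the
// shift amount in f_val_1.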
#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) \
  "paddw "#f0", "#f0", "#f4" \n\t" \
  "paddw "#f2", "#f2", "#f6" \n\t" \
  "paddw "#f0", "#f0", "#f8" \n\t" \
  "paddw "#f2", "#f2", "#f10" \n\t" \
  "psraw "#f0", "#f0", "#f_val_1" \n\t" \
  "psraw "#f2", "#f2", "#f_val_1" \n\t" \
  "mov.d "#f12", "#f0" \n\t" \
  "mov.d "#f14", "#f2" \n\t" \
  "psubw "#f12", "#f12", "#f4" \n\t" \
  "psubw "#f14", "#f14", "#f6" \n\t"

#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  MMI_XSawp_WD(f0, f2, f4, f6, f16, f18) \
  MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \
  MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) \
  MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10)

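// Forward-transform butterfly on 16-bit lanes: {f0,f2} = 2*a + b and
// {f8,f10} = a - 2*b.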
#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) \
  "mov.d "#f8", "#f0" \n\t" \
  "mov.d "#f10", "#f2" \n\t" \
  "paddh "#f0", "#f0", "#f0" \n\t" \
  "paddh "#f2", "#f2", "#f2" \n\t" \
  "paddh "#f0", "#f0", "#f4" \n\t" \
  "paddh "#f2", "#f2", "#f6" \n\t" \
  "psubh "#f8", "#f8", "#f4" \n\t" \
  "psubh "#f10", "#f10", "#f6" \n\t" \
  "psubh "#f8", "#f8", "#f4" \n\t" \
  "psubh "#f10", "#f10", "#f6" \n\t"

#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) \
  MMI_SumSub(f20, f22, f8, f10, f16, f18) \
  MMI_SumSub(f0, f2, f4, f6, f16, f18) \
  MMI_SumSub(f8, f10, f4, f6, f16, f18) \
  MMI_SumSubMul2(f20, f22, f0, f2, f12, f14)

#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18) \
  MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6) \
  "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
  "gssqc1 "#f10", "#f8", 0x10("#r0") \n\t" \
  "gssqc1 "#f18", "#f16", 0x20("#r0") \n\t" \
  "gssqc1 "#f6", "#f4", 0x30("#r0") \n\t"

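// Load four pixels each from r0 and r1, widen them to 16 bits (f4 must be
// zero) and leave the difference r0[i] - r1[i] in f0.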
#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) \
  "gsldlc1 "#f0", 0x7("#r0") \n\t" \
  "gsldlc1 "#f2", 0x7("#r1") \n\t" \
  "gsldrc1 "#f0", 0x0("#r0") \n\t" \
  "gsldrc1 "#f2", 0x0("#r1") \n\t" \
  "punpcklbh "#f0", "#f0", "#f4" \n\t" \
  "punpcklbh "#f2", "#f2", "#f4" \n\t" \
  "psubh "#f0", "#f0", "#f2" \n\t"

#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) \
  MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10) \
  PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
  PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
  MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10) \
  PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
  PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
  MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10) \
  PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
  PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
  MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10)

#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
  MMI_SumSub_SINGLE(f6, f0, f10) \
  MMI_SumSub_SINGLE(f4, f2, f10) \
  MMI_SumSub_SINGLE(f4, f6, f10) \
  MMI_SumSubMul2_SINGLE(f0, f2, f8, f12)

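// Inverse 4x4 integer transform of pDct plus reconstruction of one block:
// pRec[x,y] = clip(pPred[x,y] + ((idct(pDct) + 32) >> 6)), clamped to [0,255].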
void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                       int32_t iPredStride, int16_t* pDct) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "gsldlc1 $f0, 0x7(%[pDct]) \n\t"
    "gsldrc1 $f0, 0x0(%[pDct]) \n\t"
    "gsldlc1 $f2, 0xF(%[pDct]) \n\t"
    "gsldrc1 $f2, 0x8(%[pDct]) \n\t"
    "gsldlc1 $f4, 0x17(%[pDct]) \n\t"
    "gsldrc1 $f4, 0x10(%[pDct]) \n\t"
    "gsldlc1 $f6, 0x1F(%[pDct]) \n\t"
    "gsldrc1 $f6, 0x18(%[pDct]) \n\t"

    "dli $8, 0x1 \n\t"
    "dmtc1 $8, $f16 \n\t"
    "dli $8, 0x6 \n\t"
    "dmtc1 $8, $f18 \n\t"

    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16)

    "xor $f14, $f14, $f14 \n\t"
    "dli $8, 0x0020 \n\t"
    "dmtc1 $8, $f12 \n\t"
    "punpcklhw $f12, $f12, $f12 \n\t"
    "punpcklwd $f12, $f12, $f12 \n\t"

    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),
      [pDct]"r"((short *)pDct)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18"
  );
}

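// Same reconstruction for four adjacent 4x4 blocks (an 8x8 area): pDct holds
// 4 * 16 coefficients, and two blocks are transformed per pass in the wide
// registers.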
void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                           int32_t iPredStride, int16_t* pDct) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
    "dli $8, 0x1 \n\t"
    "dmtc1 $8, $f30 \n\t"
    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
             $f0, $f2, $f30)
    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
             $f4, $f6, $f30)

    "xor $f28, $f28, $f28 \n\t"
    "dli $8, 0x6 \n\t"
    "dmtc1 $8, $f26 \n\t"
    "dli $8, 0x0020 \n\t"
    "dmtc1 $8, $f24 \n\t"
    "punpcklhw $f24, $f24, $f24 \n\t"
    "punpcklwd $f24, $f24, $f24 \n\t"

    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

    PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t"
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
             $f0, $f2, $f30)
    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
             $f4, $f6, $f30)

    "dli $8, 0x6 \n\t"
    "dmtc1 $8, $f26 \n\t"
    "dli $8, 0x0020 \n\t"
    "dmtc1 $8, $f24 \n\t"
    "punpcklhw $f24, $f24, $f24 \n\t"
    "punpcklwd $f24, $f24, $f24 \n\t"

    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
      [pDct]"+&r"((short *)pDct)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

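// DC-only reconstruction of a 16x16 luma macroblock: every 4x4 block adds its
// rounded DC term ((dc + 32) >> 6) from pDct to the prediction.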
void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                             int32_t iPredStride, int16_t* pDct) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "dli $8, 0x0020 \n\t"
    "dmtc1 $8, $f24 \n\t"
    "punpcklhw $f24, $f24, $f24 \n\t"
    "punpcklwd $f24, $f24, $f24 \n\t"
    "dli $8, 0x6 \n\t"
    "dmtc1 $8, $f30 \n\t"

    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24,
                %[pDct], 0x0, $f30)

    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30)
    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
      [pDct]"+&r"((short *)pDct)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

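// 4x4 Hadamard transform of the 16 luma DC coefficients gathered from pDct;
// the final butterfly stage applies (a + b + 1) >> 1 before the result is
// packed back to 16 bits and written to luma_dc.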
WelsHadamardT4Dc_mmi(int16_t * luma_dc,int16_t * pDct)402 void WelsHadamardT4Dc_mmi( int16_t *luma_dc, int16_t *pDct) {
403 BACKUP_REG;
404 __asm__ volatile (
405 ".set arch=loongson3a \n\t"
406 MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0)
407 MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40)
408 MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100)
409 MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140)
410
411 MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30)
412 MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)
413 MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30)
414 MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)
415
416 MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22)
417
418 MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30)
419 MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)
420
421 "dli $8, 0x1F \n\t"
422 "dmtc1 $8, $f30 \n\t"
423
424 WELS_DD1($f24, $f26, $f30)
425
426 "dli $8, 0x1 \n\t"
427 "dmtc1 $8, $f30 \n\t"
428
429 MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30)
430 MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)
431 MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)
432
433 "packsswh $f12, $f12, $f14 \n\t"
434 "packsswh $f14, $f16, $f18 \n\t"
435
436 "packsswh $f8, $f8, $f10 \n\t"
437 "packsswh $f10, $f4, $f6 \n\t"
438 "gssqc1 $f14, $f12, 0x0(%[luma_dc]) \n\t"
439 "gssqc1 $f10, $f8, 0x10(%[luma_dc]) \n\t"
440 :
441 : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct)
442 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
443 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
444 );
445 RECOVER_REG;
446 }
447
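// Forward 4x4 integer transform of the difference between the block at pix1
// (stride i_pix1) and the block at pix2 (stride i_pix2); the 16 coefficients
// are written to pDct.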
void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
                   uint8_t *pix2, int32_t i_pix2) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "xor $f14, $f14, $f14 \n\t"
    "dli $8, 0x1 \n\t"
    "dmtc1 $8, $f16 \n\t"

    MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1],
                            %[pix2], %[i_pix2], $f0, $f14)

    MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4)

    MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)

    "gssdlc1 $f4, 0x7(%[pDct]) \n\t"
    "gssdlc1 $f2, 0xF(%[pDct]) \n\t"
    "gssdlc1 $f10, 0x17(%[pDct]) \n\t"
    "gssdlc1 $f8, 0x1F(%[pDct]) \n\t"
    "gssdrc1 $f4, 0x0(%[pDct]) \n\t"
    "gssdrc1 $f2, 0x8(%[pDct]) \n\t"
    "gssdrc1 $f10, 0x10(%[pDct]) \n\t"
    "gssdrc1 $f8, 0x18(%[pDct]) \n\t"
    : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
    : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16"
  );
}

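// Forward 4x4 transform of four adjacent difference blocks (an 8x8 area);
// 64 coefficients are written to pDct, two blocks per pass.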
void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
                       uint8_t *pix2, int32_t i_pix2) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "xor $f28, $f28, $f28 \n\t"
    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
    PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

    PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t"
    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
    : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
    : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
  );
  RECOVER_REG;
}