/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    dct_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    17/07/2018 Created
 *
 *************************************************************************************
 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
/* Advance %[pPred] by two rows and accumulate the left-neighbor pixel
 * (pPred[-1]) of each row into the running sum held in $8; $9 is scratch. */
#define LOAD_2_LEFT_AND_ADD \
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
    "lbu $9, -0x1(%[pPred]) \n\t" \
    PTR_ADDU "$8, $8, $9 \n\t" \
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
    "lbu $9, -0x1(%[pPred]) \n\t" \
    PTR_ADDU "$8, $8, $9 \n\t"
50
/* 16 x 0x80: mid-gray fill used by DC prediction when no neighbors are
 * available (see WelsDecoderI16x16LumaPredDcNA_mmi). */
unsigned char mmi_dc_0x80[16] __attribute__((aligned(16))) = {
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

/* Rounding constant (+2) used by the chroma DC-top average. */
short mmi_wd_0x02[8] __attribute__((aligned(16))) = {2, 2, 2, 2, 2, 2, 2, 2};
/* Per-lane weights used by the 16x16 luma plane-mode gradient fit. */
short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4, -3, -2, -1, 0};
short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};

/* Chroma (8x8) variants of the plane-mode weights. */
short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4};
short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1};
short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0, 1, 2, 3, 4};

/* 16 x 0x01: multiplier used with pmuluw to splat one byte across a word. */
unsigned char mmi_01bytes[16]__attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1,
                                                             1, 1, 1, 1, 1, 1, 1, 1};
66
/*!
 * \brief Add the inverse-transformed 4x4 residual in pRs to the 4x4
 *        prediction block at pPred (row pitch kiStride) with unsigned
 *        saturation, in place.  Loongson-3A MMI implementation.
 *
 * \param pPred     prediction block, updated in place
 * \param kiStride  distance in bytes between successive rows of pPred
 * \param pRs       16 int16_t residual coefficients (4x4, row-major)
 */
void IdctResAddPred_mmi(uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    /* load the four 8-byte residual rows into $f0/$f2/$f4/$f6 */
    "dli $8, 0x1 \n\t"
    "gsldxc1 $f0, 0x0(%[pRs], $0) \n\t"
    "gsldxc1 $f2, 0x8(%[pRs], $0) \n\t"
    "gsldxc1 $f4, 0x10(%[pRs], $0) \n\t"
    "gsldxc1 $f6, 0x18(%[pRs], $0) \n\t"
    "dmtc1 $8, $f14 \n\t"

    /* transpose, 1-D IDCT, transpose back, 1-D IDCT again (macros from
     * asmdefs_mmi.h) */
    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f14)
    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f14)

    /* $f12 = rounding constant 32 broadcast to all lanes, $f16 = shift 6,
     * i.e. the final (res + 32) >> 6 normalization inside the store macro */
    "dli $8, 0x20 \n\t"
    "xor $f14, $f14, $f14 \n\t"
    "dmtc1 $8, $f12 \n\t"
    "pshufh $f12, $f12, $f14 \n\t"
    "dli $8, 0x6 \n\t"
    "dmtc1 $8, $f16 \n\t"

    /* add each residual row to the prediction row and store, row by row */
    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
    : [pPred]"+&r"((unsigned char *)pPred)
    : [pRs]"r"((unsigned char *)pRs), [kiStride]"r"((int)kiStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16"
  );
}
102
WelsDecoderI16x16LumaPredDc_mmi(uint8_t * pPred,const int32_t kiStride)103 void WelsDecoderI16x16LumaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
104 __asm__ volatile(
105 ".set arch=loongson3a \n\t"
106 "dli $8, 0x5 \n\t"
107 "gsldxc1 $f10, 0x0(%[mmi_01bytes], $0) \n\t"
108 "dmtc1 $8, $f8 \n\t"
109
110 "move $10, %[pPred] \n\t"
111 PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
112 "gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
113 "xor $f4, $f4, $f4 \n\t"
114 "pasubub $f0, $f0, $f4 \n\t"
115 "pasubub $f2, $f2, $f4 \n\t"
116 "biadd $f0, $f0 \n\t"
117 "biadd $f2, $f2 \n\t"
118 "paddh $f0, $f0, $f2 \n\t"
119
120 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
121 "lbu $8, -0x1(%[pPred]) \n\t"
122 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
123 "lbu $9, -0x1(%[pPred]) \n\t"
124 PTR_ADDU "$8, $8, $9 \n\t"
125
126 LOAD_2_LEFT_AND_ADD
127 LOAD_2_LEFT_AND_ADD
128 LOAD_2_LEFT_AND_ADD
129 LOAD_2_LEFT_AND_ADD
130 LOAD_2_LEFT_AND_ADD
131 LOAD_2_LEFT_AND_ADD
132 LOAD_2_LEFT_AND_ADD
133
134 PTR_ADDIU "$8, $8, 0x10 \n\t"
135 "dmtc1 $8, $f4 \n\t"
136 "paddh $f0, $f0, $f4 \n\t"
137 "psrlw $f0, $f0, $f8 \n\t"
138 "pmuluw $f0, $f0, $f10 \n\t"
139 "punpcklwd $f0, $f0, $f0 \n\t"
140 "mov.d $f2, $f0 \n\t"
141
142 "gssqc1 $f2, $f0, 0x0($10) \n\t"
143 PTR_ADDU "$10, $10, %[kiStride] \n\t"
144 "gssqc1 $f2, $f0, 0x0($10) \n\t"
145 PTR_ADDU "$10, $10, %[kiStride] \n\t"
146 "gssqc1 $f2, $f0, 0x0($10) \n\t"
147
148 PTR_ADDU "$10, $10, %[kiStride] \n\t"
149 "gssqc1 $f2, $f0, 0x0($10) \n\t"
150 PTR_ADDU "$10, $10, %[kiStride] \n\t"
151 "gssqc1 $f2, $f0, 0x0($10) \n\t"
152
153 PTR_ADDU "$10, $10, %[kiStride] \n\t"
154 "gssqc1 $f2, $f0, 0x0($10) \n\t"
155 PTR_ADDU "$10, $10, %[kiStride] \n\t"
156 "gssqc1 $f2, $f0, 0x0($10) \n\t"
157
158 PTR_ADDU "$10, $10, %[kiStride] \n\t"
159 "gssqc1 $f2, $f0, 0x0($10) \n\t"
160 PTR_ADDU "$10, $10, %[kiStride] \n\t"
161 "gssqc1 $f2, $f0, 0x0($10) \n\t"
162
163 PTR_ADDU "$10, $10, %[kiStride] \n\t"
164 "gssqc1 $f2, $f0, 0x0($10) \n\t"
165 PTR_ADDU "$10, $10, %[kiStride] \n\t"
166 "gssqc1 $f2, $f0, 0x0($10) \n\t"
167
168 PTR_ADDU "$10, $10, %[kiStride] \n\t"
169 "gssqc1 $f2, $f0, 0x0($10) \n\t"
170 PTR_ADDU "$10, $10, %[kiStride] \n\t"
171 "gssqc1 $f2, $f0, 0x0($10) \n\t"
172
173 PTR_ADDU "$10, $10, %[kiStride] \n\t"
174 "gssqc1 $f2, $f0, 0x0($10) \n\t"
175 PTR_ADDU "$10, $10, %[kiStride] \n\t"
176 "gssqc1 $f2, $f0, 0x0($10) \n\t"
177
178 PTR_ADDU "$10, $10, %[kiStride] \n\t"
179 "gssqc1 $f2, $f0, 0x0($10) \n\t"
180 : [pPred] "+&r"((unsigned char *)pPred)
181 : [kiStride] "r"((int)kiStride),
182 [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
183 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10"
184 );
185 }
186
/*!
 * \brief 16x16 luma plane (Intra_16x16_Plane) prediction: fit a linear
 *        gradient to the top and left neighbors and fill the block with the
 *        saturated plane a + b*(x-7) + c*(y-7).  Loongson-3A MMI
 *        implementation.
 *
 * \param pPred     top-left of the 16x16 block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI16x16LumaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "move $10, %[pPred] \n\t"                     /* $10 = block origin */
    PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
    PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"

    /* horizontal slope numerator H: weighted difference of the right half
     * (weights 1..8) and the mirrored left half (weights 8..1) of the
     * top-neighbor row */
    "gsldlc1 $f0, 0x7(%[pPred]) \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "gsldrc1 $f0, 0x0(%[pPred]) \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
    "punpckhbh $f2, $f0, $f28 \n\t"
    "punpcklbh $f0, $f0, $f28 \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "gsldlc1 $f4, 0x10(%[pPred]) \n\t"
    "pmullh $f2, $f2, $f22 \n\t"
    "gsldrc1 $f4, 0x9(%[pPred]) \n\t"
    "gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
    "punpckhbh $f6, $f4, $f28 \n\t"
    "punpcklbh $f4, $f4, $f28 \n\t"
    "pmullh $f4, $f4, $f24 \n\t"
    "pmullh $f6, $f6, $f26 \n\t"
    "psubh $f4, $f4, $f0 \n\t"
    "psubh $f6, $f6, $f2 \n\t"

    /* b = (5*H + 32) >> 6, broadcast into $f4/$f6 */
    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
    "dmfc1 $8, $f4 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x5 \n\t"
    PTR_ADDIU "$8, $8, 0x20 \n\t"
    "sra $8, $8, 0x6 \n\t"
    MMI_Copy8Times($f4, $f6, $f28, $8)

    /* $9 accumulates top-right + bottom-left corner neighbors, later
     * scaled by 16 to form the plane offset a */
    "lbu $9, 0x10(%[pPred]) \n\t"
    PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
    LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
                %[kiStride], $11)

    PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
    "dsll $11, %[kiStride], 0x3 \n\t"
    PTR_ADDU "$11, $11, %[pPred] \n\t"
    "lbu $8, 0x0($11) \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    "dsll $9, $9, 0x4 \n\t"

    /* second gather of the left column for the lower half */
    PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
                %[kiStride], $11)

    /* vertical slope numerator V from the weighted left-neighbor columns */
    "xor $f16, $f16, $f16 \n\t"
    "punpcklbh $f0, $f2, $f16 \n\t"
    "punpckhbh $f2, $f2, $f16 \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "pmullh $f2, $f2, $f22 \n\t"
    "punpcklbh $f28, $f30, $f16 \n\t"
    "punpckhbh $f30, $f30, $f16 \n\t"
    "pmullh $f28, $f28, $f24 \n\t"
    "pmullh $f30, $f30, $f26 \n\t"
    "psubh $f28, $f28, $f0 \n\t"
    "psubh $f30, $f30, $f2 \n\t"

    "xor $f8, $f8, $f8 \n\t"

    /* c = (5*V + 32) >> 6, broadcast into $f16/$f18 */
    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
    "dmfc1 $8, $f28 \n\t"
    "seh $8, $8 \n\t"

    "mul $8, $8, 0x5 \n\t"
    PTR_ADDIU "$8, $8, 0x20 \n\t"
    "sra $8, $8, 0x6 \n\t"
    MMI_Copy8Times($f16, $f18, $f8, $8)

    /* row seed = a + 16 - 7*c, broadcast into $f0/$f2 */
    "move %[pPred], $10 \n\t"
    PTR_ADDIU "$9, $9, 0x10 \n\t"
    "mul $8, $8, -0x7 \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    MMI_Copy8Times($f0, $f2, $f8, $9)

    "xor $8, $8, $8 \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"

    /* 16 rows: left 8 pixels use b*(-7..0), right 8 use b*(1..8); each
     * lane is (seed + b*w) >> 5 with saturating pack; seed += c per row */
    "dli $11, 0x5 \n\t"
    "dmtc1 $11, $f30 \n\t"
    "1: \n\t"
    "pmullh $f8, $f4, $f20 \n\t"
    "pmullh $f10, $f6, $f22 \n\t"
    "paddh $f8, $f8, $f0 \n\t"
    "paddh $f10, $f10, $f2 \n\t"
    "psrah $f8, $f8, $f30 \n\t"
    "psrah $f10, $f10, $f30 \n\t"
    "pmullh $f12, $f4, $f24 \n\t"
    "pmullh $f14, $f6, $f26 \n\t"
    "paddh $f12, $f12, $f0 \n\t"
    "paddh $f14, $f14, $f2 \n\t"
    "psrah $f12, $f12, $f30 \n\t"
    "psrah $f14, $f14, $f30 \n\t"
    "packushb $f8, $f8, $f10 \n\t"
    "packushb $f10, $f12, $f14 \n\t"
    "gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    PTR_ADDIU "$8, $8, 0x1 \n\t"
    PTR_ADDIU "$11, $8, -0x10 \n\t"
    "bnez $11, 1b \n\t"
    "nop \n\t"
    : [pPred]"+&r"((unsigned char *)pPred)
    : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
      [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
      "$f28", "$f30"
  );
  RECOVER_REG;
}
304
/* Splat the left-neighbor pixel of the current row into all 16 bytes of
 * f0:f2.  Reads the 16 bytes ending at r0-1; f4 holds the shift count 56 so
 * dsrl leaves that last byte in the low byte of f0, f6 holds mmi_01bytes for
 * the byte-splat multiply, f8 is unused scratch kept for the signature. */
#define COPY_16_TIMES(r0, f0, f2, f4, f6, f8) \
    "gslqc1 "#f2", "#f0", -0x10("#r0") \n\t" \
    "dsrl "#f0", "#f2", "#f4" \n\t" \
    "pmuluw "#f0", "#f0", "#f6" \n\t" \
    "punpcklwd "#f0", "#f0", "#f0" \n\t" \
    "mov.d "#f2", "#f0" \n\t"

/* Emit two rows of 16x16 horizontal prediction: advance one row, splat its
 * left neighbor across 16 bytes and store, twice. */
#define MMI_PRED_H_16X16_TWO_LINE_DEC \
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t" \
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
319
/*!
 * \brief 16x16 luma horizontal intra prediction: each row is filled with its
 *        own left-neighbor pixel.  Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 16x16 block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI16x16LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "dli $8, 56 \n\t"                             /* shift for COPY_16_TIMES */
    "dmtc1 $8, $f4 \n\t"
    "gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"   /* byte-splat multiplier */
    "xor $f8, $f8, $f8 \n\t"

    /* first two rows, then seven macro expansions for the remaining 14 */
    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"

    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    MMI_PRED_H_16X16_TWO_LINE_DEC
    : [pPred]"+&r"((unsigned char *)pPred)
    : [kiStride]"r"((int)kiStride),
      [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
  );
}
347
/*!
 * \brief 16x16 luma vertical intra prediction: copy the 16-byte row above
 *        the block into each of its 16 rows.  Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 16x16 block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI16x16LumaPredV_mmi(uint8_t *pPred, const int32_t kiStride) {
  __asm__ volatile(
    ".set arch=loongson3a \n\t"
    /* load the 16 top neighbors once into $f0/$f2 */
    PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"

    /* store them down all 16 rows */
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    : [pPred] "+&r"((unsigned char *)pPred)
    : [kiStride] "r"((int)kiStride)
    : "memory", "$f0", "$f2"
  );
}
391
/*!
 * \brief 16x16 luma DC intra prediction using only the top neighbors:
 *        dc = (sumTop + 8) >> 4, splatted across the whole block.
 *        Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 16x16 block to fill; the row above must
 *                  be decodable
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
  BACKUP_REG;  /* NOTE(review): presumably saves the callee-saved FP regs
                * ($f28 is written below) — defined in asmdefs_mmi.h */
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    /* sum the 16 top neighbors with pasubub-against-zero + biadd */
    PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
    "gslqc1 $f2, $f0, 0x0($8) \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "pasubub $f0, $f0, $f28 \n\t"
    "pasubub $f2, $f2, $f28 \n\t"
    "biadd $f0, $f0 \n\t"
    "biadd $f2, $f2 \n\t"
    "paddh $f0, $f0, $f2 \n\t"
    "dmfc1 $8, $f0 \n\t"

    /* dc = (sum + 8) >> 4, splat to 16 bytes in $f4/$f6 (copy in $f0/$f2) */
    PTR_ADDIU "$8, $8, 0x8 \n\t"
    "dsra $8, $8, 0x4 \n\t"
    MMI_Copy16Times($f4, $f6, $f28, $8)
    "mov.d $f0, $f4 \n\t"
    "mov.d $f2, $f6 \n\t"

    /* store the DC row down all 16 rows */
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    : [pPred]"+&r"((unsigned char *)pPred)
    : [kiStride]"r"((int)kiStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
  );
  RECOVER_REG;
}
452
/*!
 * \brief 16x16 luma DC intra prediction with no neighbors available: fill
 *        the whole block with the mid-gray value 0x80 (128).
 *        Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 16x16 block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t *pPred, const int32_t kiStride) {
  __asm__ volatile(
    ".set arch=loongson3a \n\t"
    /* load 16 bytes of 0x80 and keep a copy so stores can alternate pairs */
    "gslqc1 $f2, $f0, 0x0(%[mmi_dc_0x80]) \n\t"
    "mov.d $f4, $f0 \n\t"
    "mov.d $f6, $f2 \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"

    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
    : [pPred] "+&r"((unsigned char *)pPred)
    : [kiStride] "r"((int)kiStride), [mmi_dc_0x80] "r"(mmi_dc_0x80)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
  );
}
498
/*!
 * \brief 8x8 chroma plane (Intra_Chroma_Plane) prediction: fit a linear
 *        gradient to the top and left neighbors and fill the block with the
 *        saturated plane a + b*(x-3) + c*(y-3).  Loongson-3A MMI
 *        implementation.
 *
 * \param pPred     top-left of the 8x8 chroma block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderIChromaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "move $10, %[pPred] \n\t"                     /* $10 = block origin */
    PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
    PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"

    /* horizontal slope numerator H from the top-neighbor row, using the
     * 4-lane dec (4..1) and inc (1..4) weight tables */
    "gsldlc1 $f0, 0x7(%[pPred]) \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "gsldrc1 $f0, 0x0(%[pPred]) \n\t"
    "gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
    "punpcklbh $f0, $f0, $f28 \n\t"
    "gsldlc1 $f4, 0xc(%[pPred]) \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "gsldrc1 $f4, 0x5(%[pPred]) \n\t"
    "gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
    "punpcklbh $f4, $f4, $f28 \n\t"
    "pmullh $f4, $f4, $f24 \n\t"
    "psubh $f4, $f4, $f0 \n\t"

    /* b = (17*H + 16) >> 5 broadcast into $f4/$f6 */
    "xor $f6, $f6, $f6 \n\t"
    "xor $f8, $f8, $f8 \n\t"
    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
    "dmfc1 $8, $f4 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x11 \n\t"
    PTR_ADDIU "$8, $8, 0x10 \n\t"
    "sra $8, $8, 0x5 \n\t"
    MMI_Copy8Times($f4, $f6, $f8, $8)

    /* $9 accumulates top-right + bottom-left corner neighbors for offset a */
    "lbu $9, 0x8(%[pPred]) \n\t"
    PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
    LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pPred], %[kiStride], $11)

    PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
    "dsll $11, %[kiStride], 0x2 \n\t"
    PTR_ADDU "$11, $11, %[pPred] \n\t"
    "lbu $8, 0x0($11) \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    "dsll $9, $9, 0x4 \n\t"

    /* vertical slope numerator V from the left-neighbor column */
    PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
    "xor $f16, $f16, $f16 \n\t"
    "punpckhbh $f0, $f0, $f16 \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "punpckhbh $f28, $f28, $f16 \n\t"
    "pmullh $f28, $f28, $f24 \n\t"
    "psubh $f28, $f28, $f0 \n\t"

    /* c = (17*V + 16) >> 5 broadcast into $f16/$f18 */
    "xor $f30, $f30, $f30 \n\t"
    "xor $f8, $f8, $f8 \n\t"
    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
    "dmfc1 $8, $f28 \n\t"
    "seh $8, $8 \n\t"

    "mul $8, $8, 0x11 \n\t"
    PTR_ADDIU "$8, $8, 0x10 \n\t"
    "sra $8, $8, 0x5 \n\t"
    MMI_Copy8Times($f16, $f18, $f8, $8)

    /* row seed = a + 16 - 3*c broadcast into $f0/$f2 */
    "move %[pPred], $10 \n\t"
    PTR_ADDIU "$9, $9, 0x10 \n\t"
    "mul $8, $8, -0x3 \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    MMI_Copy8Times($f0, $f2, $f8, $9)

    "xor $8, $8, $8 \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"

    /* 8 rows: pixel = (seed + b*(-3..4)) >> 5 with saturating pack;
     * seed += c per row */
    "dli $11, 0x5 \n\t"
    "dmtc1 $11, $f30 \n\t"
    "1: \n\t"
    "pmullh $f8, $f4, $f20 \n\t"
    "pmullh $f10, $f6, $f22 \n\t"
    "paddh $f8, $f8, $f0 \n\t"
    "paddh $f10, $f10, $f2 \n\t"
    "psrah $f8, $f8, $f30 \n\t"
    "psrah $f10, $f10, $f30 \n\t"
    "packushb $f8, $f8, $f10 \n\t"
    "gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    PTR_ADDIU "$8, $8, 0x1 \n\t"
    PTR_ADDIU "$11, $8, -0x8 \n\t"
    "bnez $11, 1b \n\t"
    "nop \n\t"
    : [pPred]"+&r"((unsigned char *)pPred)
    : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
      [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}
597
/*!
 * \brief 8x8 chroma DC intra prediction (top and left neighbors available).
 *        The block is split into four 4x4 quadrants: the top-left and
 *        bottom-right quadrants use (topSum + leftSum + 4) >> 3; the
 *        top-right and bottom-left quadrants use their single neighbor
 *        sum rounded as (sum + 2) >> 2.  Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 8x8 chroma block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderIChromaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
  __asm__ volatile(
    ".set arch=loongson3a \n\t"
    "move $10, %[pPred] \n\t"                     /* $10 = block origin */

    /* $f0 = the 8 top neighbors */
    PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
    "gsldxc1 $f0, 0x0(%[pPred], $0) \n\t"

    /* $f2 = sum of the upper four left neighbors */
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $8, -0x1(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    "dmtc1 $8, $f2 \n\t"

    /* $f4 = sum of the lower four left neighbors */
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $8, -0x1(%[pPred]) \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pPred]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    "dmtc1 $8, $f4 \n\t"

    /* $f6 = sum of the left four top neighbors,
     * $f0 = sum of the right four top neighbors */
    "xor $f8, $f8, $f8 \n\t"
    "punpcklwd $f6, $f0, $f8 \n\t"
    "punpckhwd $f0, $f0, $f8 \n\t"
    "pasubub $f0, $f0, $f8 \n\t"
    "pasubub $f6, $f6, $f8 \n\t"
    "biadd $f0, $f0 \n\t"
    "biadd $f6, $f6 \n\t"

    /* combined sums for the two mixed quadrants:
     * $f6 = topLeftSum + upperLeftSum, $f2 = lowerLeftSum + topRightSum */
    "paddd $f6, $f6, $f2 \n\t"
    "paddd $f2, $f4, $f0 \n\t"

    "dli $8, 0x2 \n\t"
    "dmtc1 $8, $f8 \n\t"
    "gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"  /* byte-splat multiplier */
    "dli $8, 0x3 \n\t"
    "dmtc1 $8, $f10 \n\t"

    /* single-neighbor quadrants: (sum + 2) >> 2 */
    "paddd $f0, $f0, $f8 \n\t"
    "dsrl $f0, $f0, $f8 \n\t"

    "paddd $f4, $f4, $f8 \n\t"
    "dsrl $f4, $f4, $f8 \n\t"

    /* two-neighbor quadrants: (sum + 4) >> 3 */
    "paddd $f6, $f6, $f8 \n\t"
    "paddd $f6, $f6, $f8 \n\t"
    "dsrl $f6, $f6, $f10 \n\t"

    "paddd $f2, $f2, $f8 \n\t"
    "paddd $f2, $f2, $f8 \n\t"
    "dsrl $f2, $f2, $f10 \n\t"

    /* splat each quadrant DC to 4 bytes and merge: $f0 = row pattern for
     * the top half, $f2 = row pattern for the bottom half */
    "dli $8, 0x20 \n\t"
    "dmtc1 $8, $f8 \n\t"
    "pmuluw $f0, $f0, $f12 \n\t"
    "pmuluw $f6, $f6, $f12 \n\t"
    "dsll $f0, $f0, $f8 \n\t"
    "xor $f0, $f0, $f6 \n\t"

    "pmuluw $f4, $f4, $f12 \n\t"
    "pmuluw $f2, $f2, $f12 \n\t"
    "dsll $f2, $f2, $f8 \n\t"
    "xor $f2, $f2, $f4 \n\t"

    /* top four rows get $f0, bottom four rows get $f2 */
    "gssdxc1 $f0, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f0, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f0, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f0, 0x0($10, $0) \n\t"

    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f2, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f2, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f2, 0x0($10, $0) \n\t"
    PTR_ADDU "$10, $10, %[kiStride] \n\t"
    "gssdxc1 $f2, 0x0($10, $0) \n\t"
    : [pPred] "+&r"((unsigned char *)pPred)
    : [kiStride] "r"((int)kiStride),
      [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
      "$f12"
  );
}
698
WelsDecoderIChromaPredDcTop_mmi(uint8_t * pPred,const int32_t kiStride)699 void WelsDecoderIChromaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
700 BACKUP_REG;
701 __asm__ volatile (
702 ".set arch=loongson3a \n\t"
703 "dli $8, 0x4e \n\t"
704 "dmtc1 $8, $f16 \n\t"
705 "dli $8, 0xb1 \n\t"
706 "dmtc1 $8, $f18 \n\t"
707 "dli $8, 0x2 \n\t"
708 "dmtc1 $8, $f20 \n\t"
709 PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
710 "gsldxc1 $f0, 0x0($8, $0) \n\t"
711 "xor $f28, $f28, $f28 \n\t"
712 "punpckhbh $f2, $f0, $f28 \n\t"
713 "punpcklbh $f0, $f0, $f28 \n\t"
714 "pshufh $f4, $f0, $f16 \n\t"
715 "pshufh $f6, $f2, $f16 \n\t"
716 "paddh $f0, $f0, $f4 \n\t"
717 "paddh $f2, $f2, $f6 \n\t"
718
719 "pshufh $f8, $f0, $f18 \n\t"
720 "pshufh $f14, $f2, $f18 \n\t"
721 "paddh $f2, $f2, $f14 \n\t"
722 "paddh $f0, $f0, $f8 \n\t"
723
724 "gslqc1 $f26, $f24, 0x0(%[mmi_wd_0x02]) \n\t"
725 "paddh $f0, $f0, $f24 \n\t"
726 "paddh $f2, $f2, $f26 \n\t"
727 "psrah $f0, $f0, $f20 \n\t"
728 "psrah $f2, $f2, $f20 \n\t"
729 "packushb $f0, $f0, $f2 \n\t"
730
731 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
732 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
733 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
734 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
735 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
736 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
737 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
738
739 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
740 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
741 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
742 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
743 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
744 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
745 PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
746 "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
747 : [pPred] "+&r"((unsigned char *)pPred)
748 : [kiStride] "r"((int)kiStride), [mmi_wd_0x02] "r"((short *)mmi_wd_0x02)
749 : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
750 );
751 RECOVER_REG;
752 }
753
/*!
 * \brief 4x4 luma horizontal intra prediction: each of the four rows is
 *        filled with its own left-neighbor pixel replicated four times.
 *        Loongson-3A MMI implementation.
 *
 * \param pPred     top-left of the 4x4 block to fill
 * \param kiStride  distance in bytes between successive rows of pPred
 */
void WelsDecoderI4x4LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
  __asm__ volatile(
    ".set arch=loongson3a \n\t"
    "gsldxc1 $f8, 0x0(%[mmi_01bytes], $0) \n\t"   /* byte-splat multiplier */
    /* row 0: splat pPred[-1] across a word */
    "lbu $8, -0x1(%[pPred]) \n\t"
    "dmtc1 $8, $f0 \n\t"
    "pmuluw $f0, $f0, $f8 \n\t"

    /* row 1 */
    PTR_ADDU "$9, %[pPred], %[kiStride] \n\t"
    "lbu $8, -0x1($9) \n\t"
    "dmtc1 $8, $f2 \n\t"
    "pmuluw $f2, $f2, $f8 \n\t"

    /* row 2 */
    PTR_ADDU "$10, $9, %[kiStride] \n\t"
    "lbu $8, -0x1($10) \n\t"
    "dmtc1 $8, $f4 \n\t"
    "pmuluw $f4, $f4, $f8 \n\t"

    /* row 3 */
    PTR_ADDU "$11, $10, %[kiStride] \n\t"
    "lbu $8, -0x1($11) \n\t"
    "dmtc1 $8, $f6 \n\t"
    "pmuluw $f6, $f6, $f8 \n\t"

    /* 4-byte store per row */
    "gsswxc1 $f0, 0x0(%[pPred], $0) \n\t"
    "gsswxc1 $f2, 0x0($9, $0) \n\t"
    "gsswxc1 $f4, 0x0($10, $0) \n\t"
    "gsswxc1 $f6, 0x0($11, $0) \n\t"
    : [pPred] "+&r"((unsigned char *)pPred)
    : [kiStride] "r"((int)kiStride),
      [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8"
  );
}
787