/*!
 * \copy
 * Copyright (c) 2009-2018, Cisco Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file intra_pred_com_mmi.c
 *
 * \brief Loongson MMI optimizations for common intra prediction
 *
 * \date 23/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

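/* Emit one row of the 16x16 horizontal prediction: step pPred to the
 * next 16-byte row and pRef down one stride (pRef has been
 * pre-decremented onto the left-neighbour column), then splat that
 * pixel across the row. $8 is scratch and $f4 must already be zero. */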
#define MMI_PRED_H_16X16_ONE_LINE \
  PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t" \
  PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
  "lbu $8, 0x0(%[pRef]) \n\t" \
  MMI_Copy16Times($f0, $f2, $f4, $8) \
  "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"

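/* Walk down two more rows of the left-neighbour column, adding the two
 * pixels at pRef[-1] into the running sum in $8 ($9 is scratch). */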
#define LOAD_2_LEFT_AND_ADD \
  PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
  "lbu $9, -0x1(%[pRef]) \n\t" \
  PTR_ADDU "$8, $8, $9 \n\t" \
  PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
  "lbu $9, -0x1(%[pRef]) \n\t" \
  PTR_ADDU "$8, $8, $9 \n\t"

// Emit one row of the 8x8 horizontal prediction. f2 must hold
// mmi_01bytes (eight 0x01 bytes), f4 the shift amount 0x38, and f6
// zero; r0 walks down the reference rows and the splatted left pixel
// is stored at r1_offset(r1).
#define MMI_PRED_H_8X8_ONE_LINE(f0, f2, f4, f6, r0, r1, r1_offset) \
  PTR_ADDU ""#r0", "#r0", %[kiStride] \n\t" \
  "gsldxc1 "#f0", -0x8("#r0", $0) \n\t" \
  "dsrl "#f0", "#f0", "#f4" \n\t" \
  "pmullh "#f0", "#f0", "#f2" \n\t" \
  "pshufh "#f0", "#f0", "#f6" \n\t" \
  "gssdxc1 "#f0", "#r1_offset"+0x0("#r1", $0) \n\t"

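/* Scalar C sketch of the vertical prediction below (assumes the 16-byte
 * prediction pitch implied by the 0x10 store offsets; kpTop is a name
 * local to this sketch, and the sketch is not part of the build): the
 * row of pixels just above the block is replicated into all 16 rows.
 *
 *   const uint8_t *kpTop = pRef - kiStride;
 *   for (int i = 0; i < 16; i++)
 *     memcpy (pPred + (i << 4), kpTop, 16);
 */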
void WelsI16x16LumaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
    "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"

    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride)
    : "memory", "$f0", "$f2"
  );
}

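/* Scalar C sketch of the horizontal prediction below (assumes the
 * 16-byte prediction pitch; not part of the build): each row is filled
 * with the left-neighbour pixel of that row.
 *
 *   for (int i = 0; i < 16; i++)
 *     memset (pPred + (i << 4), pRef[i * kiStride - 1], 16);
 */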
void WelsI16x16LumaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
    "lbu $8, 0x0(%[pRef]) \n\t"
    "xor $f4, $f4, $f4 \n\t"
    MMI_Copy16Times($f0, $f2, $f4, $8)
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"

    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    MMI_PRED_H_16X16_ONE_LINE
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride)
    : "memory", "$8", "$f0", "$f2", "$f4"
  );
}

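/* Scalar C sketch of the 16x16 DC prediction below (not part of the
 * build): the block is filled with the rounded mean of the 16 top and
 * 16 left neighbour pixels.
 *
 *   int32_t iSum = 16;                                // rounding term
 *   for (int i = 0; i < 16; i++)
 *     iSum += pRef[i - kiStride] + pRef[i * kiStride - 1];
 *   memset (pPred, iSum >> 5, 256);
 */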
void WelsI16x16LumaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  unsigned char mmi_01bytes[16] __attribute__((aligned(16))) =
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
    "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"
    "xor $f4, $f4, $f4 \n\t"
    "pasubub $f0, $f0, $f4 \n\t"
    "pasubub $f2, $f2, $f4 \n\t"
    "biadd $f0, $f0 \n\t"
    "biadd $f2, $f2 \n\t"
    "paddh $f0, $f0, $f2 \n\t"

    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $8, -0x1(%[pRef]) \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"

    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD

    "dli $10, 0x5 \n\t"
    "dmtc1 $10, $f6 \n\t"
    PTR_ADDIU "$8, 0x10 \n\t"
    "dmtc1 $8, $f4 \n\t"
    "paddh $f0, $f0, $f4 \n\t"
    "psrlw $f0, $f0, $f6 \n\t"
    "gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"
    "pmuluw $f0, $f0, $f6 \n\t"
    "punpcklwd $f0, $f0, $f0 \n\t"
    "mov.d $f2, $f0 \n\t"

    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
  );
}

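/* Scalar C sketch of the H.264 16x16 plane prediction implemented below
 * (assumes the 16-byte prediction pitch; iH/iV/iA/iB/iC follow the
 * usual spec naming and kpTop is local to this sketch, none of them
 * identifiers from this file):
 *
 *   const uint8_t *kpTop = pRef - kiStride;
 *   int32_t iH = 0, iV = 0;
 *   for (int i = 1; i <= 8; i++) {
 *     iH += i * (kpTop[7 + i] - kpTop[7 - i]);     // kpTop[-1] is the corner
 *     iV += i * (pRef[(7 + i) * kiStride - 1] - pRef[(7 - i) * kiStride - 1]);
 *   }
 *   int32_t iA = 16 * (kpTop[15] + pRef[15 * kiStride - 1]);
 *   int32_t iB = (5 * iH + 32) >> 6;
 *   int32_t iC = (5 * iV + 32) >> 6;
 *   for (int y = 0; y < 16; y++)
 *     for (int x = 0; x < 16; x++) {
 *       int32_t iVal = (iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5;
 *       pPred[(y << 4) + x] = (uint8_t)(iVal < 0 ? 0 : (iVal > 255 ? 255 : iVal));
 *     }
 */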
void WelsI16x16LumaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  short mmi_plane_inc_minus[8] __attribute__((aligned(16))) = {-7, -6, -5, -4,
                                                               -3, -2, -1, 0};
  short mmi_plane_inc[8] __attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
  short mmi_plane_dec[8] __attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"

    "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
    "punpckhbh $f2, $f0, $f28 \n\t"
    "punpcklbh $f0, $f0, $f28 \n\t"
    "gsldlc1 $f4, 0x10(%[pRef]) \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "pmullh $f2, $f2, $f22 \n\t"
    "gsldrc1 $f4, 0x9(%[pRef]) \n\t"
    "gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
    "punpckhbh $f6, $f4, $f28 \n\t"
    "punpcklbh $f4, $f4, $f28 \n\t"
    "pmullh $f4, $f4, $f24 \n\t"
    "pmullh $f6, $f6, $f26 \n\t"
    "psubh $f4, $f4, $f0 \n\t"
    "psubh $f6, $f6, $f2 \n\t"

    "xor $f8, $f8, $f8 \n\t"
    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
    "dmfc1 $8, $f4 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x5 \n\t"
    PTR_ADDIU "$8, $8, 0x20 \n\t"
    "sra $8, $8, 0x6 \n\t"
    MMI_Copy8Times($f4, $f6, $f28, $8)

    "lbu $9, 0x10(%[pRef]) \n\t"
    PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
    LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16,
                $f18, %[pRef], %[kiStride], $11)

    PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
    "dsll $10, %[kiStride], 0x3 \n\t"
    PTR_ADDU "$10, $10, %[pRef] \n\t"
    "lbu $8, 0x0($10) \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    "dsll $9, $9, 0x4 \n\t"

    PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16,
                $f18, %[pRef], %[kiStride], $11)
    "xor $f16, $f16, $f16 \n\t"
    "xor $f18, $f18, $f18 \n\t"
    "punpcklbh $f0, $f2, $f18 \n\t"
    "punpckhbh $f2, $f2, $f18 \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "pmullh $f2, $f2, $f22 \n\t"
    "punpcklbh $f28, $f30, $f18 \n\t"
    "punpckhbh $f30, $f30, $f18 \n\t"
    "pmullh $f28, $f28, $f24 \n\t"
    "pmullh $f30, $f30, $f26 \n\t"
    "psubh $f28, $f28, $f0 \n\t"
    "psubh $f30, $f30, $f2 \n\t"

    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
    "dmfc1 $8, $f28 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x5 \n\t"
    PTR_ADDIU "$8, $8, 0x20 \n\t"
    "sra $8, $8, 0x6 \n\t"
    "xor $f20, $f20, $f20 \n\t"
    MMI_Copy8Times($f16, $f18, $f20, $8)

    PTR_ADDIU "$9, $9, 0x10 \n\t"
    "mul $8, $8, -0x7 \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    "xor $f20, $f20, $f20 \n\t"
    MMI_Copy8Times($f0, $f2, $f20, $8)

    "xor $8, $8, $8 \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"

    "dli $10, 0x5 \n\t"
    "dmtc1 $10, $f30 \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    "1: \n\t"
    "pmullh $f8, $f4, $f20 \n\t"
    "pmullh $f10, $f6, $f22 \n\t"
    "paddh $f8, $f8, $f0 \n\t"
    "paddh $f10, $f10, $f2 \n\t"
    "psrah $f8, $f8, $f30 \n\t"
    "psrah $f10, $f10, $f30 \n\t"
    "pmullh $f12, $f4, $f24 \n\t"
    "pmullh $f14, $f6, $f26 \n\t"
    "paddh $f12, $f12, $f0 \n\t"
    "paddh $f14, $f14, $f2 \n\t"
    "psrah $f12, $f12, $f30 \n\t"
    "psrah $f14, $f14, $f30 \n\t"
    "packushb $f8, $f8, $f10 \n\t"
    "packushb $f10, $f12, $f14 \n\t"
    "gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t"
    PTR_ADDIU "$8, $8, 0x1 \n\t"
    PTR_ADDIU "$10, $8, -0x10 \n\t"
    "bnez $10, 1b \n\t"
    "nop \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
      [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
      "$f28", "$f30"
  );
  RECOVER_REG;
}

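/* Scalar C sketch of the 8x8 chroma plane prediction below (assumes the
 * 8-byte prediction pitch; names are local to this sketch). Same shape
 * as the luma version, with 4-tap gradients, the multiplier 17 and a
 * 5-bit shift, matching the constants used in the asm:
 *
 *   const uint8_t *kpTop = pRef - kiStride;
 *   int32_t iH = 0, iV = 0;
 *   for (int i = 1; i <= 4; i++) {
 *     iH += i * (kpTop[3 + i] - kpTop[3 - i]);
 *     iV += i * (pRef[(3 + i) * kiStride - 1] - pRef[(3 - i) * kiStride - 1]);
 *   }
 *   int32_t iA = 16 * (kpTop[7] + pRef[7 * kiStride - 1]);
 *   int32_t iB = (17 * iH + 16) >> 5;
 *   int32_t iC = (17 * iV + 16) >> 5;
 *   for (int y = 0; y < 8; y++)
 *     for (int x = 0; x < 8; x++) {
 *       int32_t iVal = (iA + iB * (x - 3) + iC * (y - 3) + 16) >> 5;
 *       pPred[(y << 3) + x] = (uint8_t)(iVal < 0 ? 0 : (iVal > 255 ? 255 : iVal));
 *     }
 */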
void WelsIChromaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  short mmi_plane_inc_c[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
  short mmi_plane_dec_c[4] __attribute__((aligned(16))) = {4, 3, 2, 1};
  short mmi_plane_mul_b_c[8] __attribute__((aligned(16))) = {-3, -2, -1, 0,
                                                             1, 2, 3, 4};
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"

    "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
    "xor $f28, $f28, $f28 \n\t"
    "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
    "gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
    "punpcklbh $f0, $f0, $f28 \n\t"
    "gsldlc1 $f4, 0xc(%[pRef]) \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "gsldrc1 $f4, 0x5(%[pRef]) \n\t"
    "gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
    "punpcklbh $f4, $f4, $f28 \n\t"
    "pmullh $f4, $f4, $f24 \n\t"
    "psubh $f4, $f4, $f0 \n\t"

    "xor $f6, $f6, $f6 \n\t"
    "xor $f8, $f8, $f8 \n\t"
    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
    "dmfc1 $8, $f4 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x11 \n\t"
    PTR_ADDIU "$8, $8, 0x10 \n\t"
    "sra $8, $8, 0x5 \n\t"
    MMI_Copy8Times($f4, $f6, $f28, $8)

    "lbu $8, 0x8(%[pRef]) \n\t"
    PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
    LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pRef], %[kiStride], $10)

    PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
    "dsll $10, %[kiStride], 0x2 \n\t"
    PTR_ADDU "$10, $10, %[pRef] \n\t"
    "lbu $9, 0x0($10) \n\t"
    PTR_ADDU "$9, $9, $8 \n\t"
    "dsll $9, $9, 0x4 \n\t"

    PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
    "xor $f16, $f16, $f16 \n\t"
    "punpckhbh $f0, $f0, $f16 \n\t"
    "pmullh $f0, $f0, $f20 \n\t"
    "punpckhbh $f28, $f28, $f16 \n\t"
    "pmullh $f28, $f28, $f24 \n\t"
    "psubh $f28, $f28, $f0 \n\t"

    "xor $f30, $f30, $f30 \n\t"
    "xor $f8, $f8, $f8 \n\t"
    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
    "dmfc1 $8, $f28 \n\t"
    "seh $8, $8 \n\t"
    "mul $8, $8, 0x11 \n\t"
    PTR_ADDIU "$8, $8, 0x10 \n\t"
    "sra $8, $8, 0x5 \n\t"
    MMI_Copy8Times($f16, $f18, $f8, $8)

    PTR_ADDIU "$9, $9, 0x10 \n\t"
    "mul $8, $8, -0x3 \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    MMI_Copy8Times($f0, $f2, $f8, $8)

    "xor $8, $8, $8 \n\t"
    "gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"

    "dli $10, 0x5 \n\t"
    "dmtc1 $10, $f30 \n\t"
    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"

    "1: \n\t"
    "pmullh $f8, $f4, $f20 \n\t"
    "pmullh $f10, $f6, $f22 \n\t"
    "paddh $f8, $f8, $f0 \n\t"
    "paddh $f10, $f10, $f2 \n\t"
    "psrah $f8, $f8, $f30 \n\t"
    "psrah $f10, $f10, $f30 \n\t"
    "packushb $f8, $f8, $f10 \n\t"
    "gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    PTR_ADDIU "%[pPred], %[pPred], 0x8 \n\t"
    PTR_ADDIU "$8, $8, 0x1 \n\t"
    PTR_ADDIU "$10, $8, -0x8 \n\t"
    "bnez $10, 1b \n\t"
    "nop \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
      [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

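/* Scalar C sketch of the 8x8 chroma vertical prediction below (assumes
 * the 8-byte prediction pitch implied by the store offsets):
 *
 *   const uint8_t *kpTop = pRef - kiStride;
 *   for (int i = 0; i < 8; i++)
 *     memcpy (pPred + (i << 3), kpTop, 8);
 */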
void WelsIChromaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
    "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"
    "mov.d $f2, $f0 \n\t"

    "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
    "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride)
    : "memory", "$f0", "$f2"
  );
}

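/* Scalar C sketch of the 8x8 chroma DC prediction below (assumes the
 * 8-byte prediction pitch; uiDc[] and the sum names are local to this
 * sketch): each 4x4 quadrant gets its own DC value, following the
 * H.264 chroma DC rule.
 *
 *   const uint8_t *kpTop = pRef - kiStride;
 *   int32_t iSumT0 = 0, iSumT1 = 0, iSumL0 = 0, iSumL1 = 0;
 *   for (int i = 0; i < 4; i++) {
 *     iSumT0 += kpTop[i];                        // top, left half
 *     iSumT1 += kpTop[4 + i];                    // top, right half
 *     iSumL0 += pRef[i * kiStride - 1];          // left, upper half
 *     iSumL1 += pRef[(4 + i) * kiStride - 1];    // left, lower half
 *   }
 *   const uint8_t uiDc[4] = {                    // TL, TR, BL, BR
 *     (uint8_t)((iSumT0 + iSumL0 + 4) >> 3),
 *     (uint8_t)((iSumT1 + 2) >> 2),
 *     (uint8_t)((iSumL1 + 2) >> 2),
 *     (uint8_t)((iSumT1 + iSumL1 + 4) >> 3)
 *   };
 *   for (int y = 0; y < 8; y++)
 *     for (int x = 0; x < 8; x++)
 *       pPred[(y << 3) + x] = uiDc[((y >> 2) << 1) + (x >> 2)];
 */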
void WelsIChromaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  short mmi_0x02[4] __attribute__((aligned(16))) = {2, 0, 0, 0};
  unsigned char mmi_01bytes[16] __attribute__((aligned(16))) =
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
    "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"

    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $8, -0x1(%[pRef]) \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    "dmtc1 $8, $f2 \n\t"

    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $8, -0x1(%[pRef]) \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
    "lbu $9, -0x1(%[pRef]) \n\t"
    PTR_ADDU "$8, $8, $9 \n\t"
    "dmtc1 $8, $f4 \n\t"

    "xor $f8, $f8, $f8 \n\t"
    "punpcklwd $f6, $f0, $f8 \n\t"
    "punpckhwd $f0, $f0, $f8 \n\t"
    "pasubub $f0, $f0, $f8 \n\t"
    "pasubub $f6, $f6, $f8 \n\t"
    "biadd $f0, $f0 \n\t"
    "biadd $f6, $f6 \n\t"

    "dadd $f6, $f6, $f2 \n\t"
    "dadd $f2, $f4, $f0 \n\t"

    "gsldxc1 $f8, 0x0(%[mmi_0x02], $0) \n\t"

    "dli $10, 0x2 \n\t"
    "dmtc1 $10, $f10 \n\t"
    "dadd $f0, $f0, $f8 \n\t"
    "dsrl $f0, $f0, $f10 \n\t"

    "dadd $f4, $f4, $f8 \n\t"
    "dsrl $f4, $f4, $f10 \n\t"

    "dli $10, 0x3 \n\t"
    "dmtc1 $10, $f10 \n\t"
    "dadd $f6, $f6, $f8 \n\t"
    "dadd $f6, $f6, $f8 \n\t"
    "dsrl $f6, $f6, $f10 \n\t"

    "dadd $f2, $f2, $f8 \n\t"
    "dadd $f2, $f2, $f8 \n\t"
    "dsrl $f2, $f2, $f10 \n\t"

    "dli $10, 0x20 \n\t"
    "dmtc1 $10, $f10 \n\t"
    "gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"
    "pmuluw $f0, $f0, $f12 \n\t"
    "pmuluw $f6, $f6, $f12 \n\t"
    "dsll $f0, $f0, $f10 \n\t"
    "xor $f0, $f0, $f6 \n\t"

    "pmuluw $f4, $f4, $f12 \n\t"
    "pmuluw $f2, $f2, $f12 \n\t"
    "dsll $f2, $f2, $f10 \n\t"
    "xor $f2, $f2, $f4 \n\t"

    "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
    "gssdxc1 $f0, 0x8(%[pPred], $0) \n\t"
    "gssdxc1 $f0, 0x10(%[pPred], $0) \n\t"
    "gssdxc1 $f0, 0x18(%[pPred], $0) \n\t"

    "gssdxc1 $f2, 0x20(%[pPred], $0) \n\t"
    "gssdxc1 $f2, 0x28(%[pPred], $0) \n\t"
    "gssdxc1 $f2, 0x30(%[pPred], $0) \n\t"
    "gssdxc1 $f2, 0x38(%[pPred], $0) \n\t"
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes),
      [mmi_0x02]"r"((unsigned char *)mmi_0x02)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
  );
}

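/* Scalar C sketch of the 8x8 chroma horizontal prediction below
 * (assumes the 8-byte prediction pitch; not part of the build): each
 * row is filled with the left-neighbour pixel of that row.
 *
 *   for (int i = 0; i < 8; i++)
 *     memset (pPred + (i << 3), pRef[i * kiStride - 1], 8);
 */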
void WelsIChromaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
  unsigned char mmi_01bytes[16] __attribute__((aligned(16))) =
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "gsldxc1 $f2, 0x0(%[mmi_01bytes], $0) \n\t"
    "dli $8, 0x38 \n\t"
    "dmtc1 $8, $f4 \n\t"
    "xor $f6, $f6, $f6 \n\t"
    "gsldxc1 $f0, -0x8(%[pRef], $0) \n\t"
    "dsrl $f0, $f0, $f4 \n\t"

    "pmullh $f0, $f0, $f2 \n\t"
    "pshufh $f0, $f0, $f6 \n\t"
    "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"

    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x8)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x10)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x18)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x20)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x28)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x30)
    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x38)
    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
  );
}