/*!
 * \copy
 * Copyright (c) 2009-2018, Cisco Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file score_mmi.c
 *
 * \brief Loongson optimization
 *
 * \date 21/07/2018 Created
 *
 *************************************************************************************
 */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42
/*
 * Bit-count table: entry i holds the number of set bits in the 8-bit value i
 * (256 entries). WelsGetNoneZeroCount_mmi indexes it with an 8-bit mask in
 * which a set bit marks a non-zero coefficient.
 */
unsigned char nozero_count_table[] __attribute__((aligned(16))) = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
55
WelsGetNoneZeroCount_mmi(int16_t * level)56 int32_t WelsGetNoneZeroCount_mmi(int16_t *level) {
57 int ret_val = 0;
58 __asm__ volatile(
59 ".set arch=loongson3a \n\t"
60 "gslqc1 $f2, $f0, 0x0(%[level]) \n\t"
61 "gslqc1 $f6, $f4, 0x10(%[level]) \n\t"
62 "xor $f8, $f8, $f8 \n\t"
63 "pcmpeqh $f0, $f0, $f8 \n\t"
64 "pcmpeqh $f2, $f2, $f8 \n\t"
65 "pcmpeqh $f4, $f4, $f8 \n\t"
66 "pcmpeqh $f6, $f6, $f8 \n\t"
67 "packsshb $f4, $f4, $f6 \n\t"
68 "packsshb $f6, $f0, $f2 \n\t"
69 "pmovmskb $f0, $f4 \n\t"
70 "pmovmskb $f2, $f6 \n\t"
71 "dmfc1 $8, $f0 \n\t"
72 "dmfc1 $9, $f2 \n\t"
73 "xor $8, 0xFF \n\t"
74 "xor $9, 0xFF \n\t"
75 PTR_ADDU "$10, $8, %[nozero_count_table] \n\t"
76 "lbu $8, 0x0($10) \n\t"
77 PTR_ADDU "$10, $9, %[nozero_count_table] \n\t"
78 "lbu $9, 0x0($10) \n\t"
79 PTR_ADDU "%[ret_val], $8, $9 \n\t"
80 : [ret_val] "=r"((int)ret_val)
81 : [level] "r"((unsigned char *)level),
82 [nozero_count_table] "r"((unsigned char *)nozero_count_table)
83 : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"
84 );
85 return ret_val;
86 }
87
WelsScan4x4DcAc_mmi(int16_t level[16],int16_t * pDct)88 void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) {
89 BACKUP_REG;
90 __asm__ volatile(
91 ".set arch=loongson3a \n\t"
92 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
93 "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
94 "dli $8, 0x3 \n\t"
95 "dmtc1 $8, $f22 \n\t"
96 "dli $8, 0x2 \n\t"
97 "dmtc1 $8, $f24 \n\t"
98 "dli $8, 0x1 \n\t"
99 "dmtc1 $8, $f26 \n\t"
100 "dmtc1 $0, $f28 \n\t"
101 "pextrh $f18, $f2, $f22 \n\t"
102 "pextrh $f20, $f4, $f24 \n\t"
103 "pextrh $f16, $f2, $f26 \n\t"
104 "pinsrh_2 $f4, $f4, $f18 \n\t"
105 "pinsrh_3 $f2, $f2, $f16 \n\t"
106 "pextrh $f18, $f4, $f28 \n\t"
107 "pinsrh_1 $f2, $f2, $f18 \n\t"
108 "pinsrh_0 $f4, $f4, $f20 \n\t"
109 "dli $8, 0x93 \n\t"
110 "dmtc1 $8, $f22 \n\t"
111 "dli $8, 0x39 \n\t"
112 "dmtc1 $8, $f24 \n\t"
113 "punpckhwd $f10, $f0, $f2 \n\t"
114 "punpcklwd $f8, $f0, $f2 \n\t"
115 "punpckhwd $f14, $f4, $f6 \n\t"
116 "punpcklwd $f12, $f4, $f6 \n\t"
117 "mov.d $f0, $f8 \n\t"
118 "pshufh $f2, $f10, $f22 \n\t"
119 "pshufh $f4, $f12, $f24 \n\t"
120 "mov.d $f6, $f14 \n\t"
121 "gssqc1 $f2, $f0, 0x0(%[level]) \n\t"
122 "gssqc1 $f6, $f4, 0x10(%[level]) \n\t"
123 :
124 : [level] "r"((short *)level), [pDct] "r"((short *)pDct)
125 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
126 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
127 );
128 RECOVER_REG;
129 }
130
WelsScan4x4Ac_mmi(int16_t * zig_value,int16_t * pDct)131 void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) {
132 BACKUP_REG;
133 __asm__ volatile(
134 ".set arch=loongson3a \n\t"
135 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
136 "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
137 "mov.d $f8, $f2 \n\t"
138 "mov.d $f2, $f4 \n\t"
139 "mov.d $f10, $f6 \n\t"
140
141 "mov.d $f12, $f2 \n\t"
142 "punpckhwd $f2, $f0, $f8 \n\t"
143 "punpcklwd $f0, $f0, $f8 \n\t"
144 "punpckhwd $f14, $f12, $f10 \n\t"
145 "punpcklwd $f12, $f12, $f10 \n\t"
146
147 "dmtc1 $0, $f20 \n\t"
148 "dli $8, 0x10 \n\t"
149 "dmtc1 $8, $f22 \n\t"
150 "dli $8, 0x30 \n\t"
151 "dmtc1 $8, $f24 \n\t"
152 "dli $8, 0x3 \n\t"
153 "dmtc1 $8, $f26 \n\t"
154 "dli $8, 0x93 \n\t"
155 "dmtc1 $8, $f28 \n\t"
156 "dli $8, 0x39 \n\t"
157 "dmtc1 $8, $f30 \n\t"
158 "pextrh $f16, $f0, $f26 \n\t"
159 "pextrh $f18, $f2, $f26 \n\t"
160 "pinsrh_3 $f2, $f2, $f16 \n\t"
161 "pextrh $f16, $f14, $f20 \n\t"
162 "pinsrh_0 $f14, $f14, $f18 \n\t"
163 "pextrh $f18, $f12, $f20 \n\t"
164 "pinsrh_0 $f12, $f12, $f16 \n\t"
165 "pinsrh_3 $f0, $f0, $f18 \n\t"
166
167 "mov.d $f4, $f0 \n\t"
168 "pshufh $f6, $f2, $f28 \n\t"
169 "pshufh $f8, $f12, $f30 \n\t"
170 "mov.d $f10, $f14 \n\t"
171
172 "mov.d $f12, $f8 \n\t"
173 "mov.d $f14, $f10 \n\t"
174 "dsrl $f4, $f4, $f22 \n\t"
175 "pinsrh_3 $f4, $f4, $f6 \n\t"
176 "dsrl $f6, $f6, $f22 \n\t"
177 "dsll $f14, $f12, $f24 \n\t"
178 "xor $f12, $f12, $f12 \n\t"
179 "or $f4, $f4, $f12 \n\t"
180 "or $f6, $f6, $f14 \n\t"
181 "dsrl $f8, $f8, $f22 \n\t"
182 "pinsrh_3 $f8, $f8, $f10 \n\t"
183 "dsrl $f10, $f10, $f22 \n\t"
184 "gssqc1 $f6, $f4, 0x0(%[zig_value]) \n\t"
185 "gssqc1 $f10, $f8, 0x10(%[zig_value]) \n\t"
186 :
187 : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)
188 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
189 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
190 );
191 RECOVER_REG;
192 }
193
/*
 * 16-entry score table read by WelsCalculateSingleCtr4x4_mmi. The index
 * (0..15) is the distance, minus one, between the set bit it locates in the
 * upper byte of the non-zero mask and the one it locates in the lower byte.
 */
unsigned char i_ds_table[] __attribute__((aligned(16))) = {
    3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/*
 * 256-entry score table read by WelsCalculateSingleCtr4x4_mmi, indexed by the
 * upper byte (bits 8..15) of the 16-bit non-zero coefficient mask.
 */
unsigned char high_mask_table[] __attribute__((aligned(16))) = {
     0, 0, 0, 3, 0, 2, 3, 6, 0, 2,
     2, 5, 3, 5, 6, 9, 0, 1, 2, 5,
     2, 4, 5, 8, 3, 5, 5, 8, 6, 8,
     9,12, 0, 1, 1, 4, 2, 4, 5, 8,
     2, 4, 4, 7, 5, 7, 8,11, 3, 4,
     5, 8, 5, 7, 8,11, 6, 8, 8,11,
     9,11,12,15, 0, 1, 1, 4, 1, 3,
     4, 7, 2, 4, 4, 7, 5, 7, 8,11,
     2, 3, 4, 7, 4, 6, 7,10, 5, 7,
     7,10, 8,10,11,14, 3, 4, 4, 7,
     5, 7, 8,11, 5, 7, 7,10, 8,10,
    11,14, 6, 7, 8,11, 8,10,11,14,
     9,11,11,14,12,14,15,18, 0, 0,
     1, 4, 1, 3, 4, 7, 1, 3, 3, 6,
     4, 6, 7,10, 2, 3, 4, 7, 4, 6,
     7,10, 5, 7, 7,10, 8,10,11,14,
     2, 3, 3, 6, 4, 6, 7,10, 4, 6,
     6, 9, 7, 9,10,13, 5, 6, 7,10,
     7, 9,10,13, 8,10,10,13,11,13,
    14,17, 3, 4, 4, 7, 4, 6, 7,10,
     5, 7, 7,10, 8,10,11,14, 5, 6,
     7,10, 7, 9,10,13, 8,10,10,13,
    11,13,14,17, 6, 7, 7,10, 8,10,
    11,14, 8,10,10,13,11,13,14,17,
     9,10,11,14,11,13,14,17,12,14,
    14,17,15,17,18,21};
223
/*
 * 256-entry score table read by WelsCalculateSingleCtr4x4_mmi, indexed by the
 * lower byte (bits 0..7) of the 16-bit non-zero coefficient mask.
 */
unsigned char low_mask_table[] __attribute__((aligned(16))) = {
     0, 3, 2, 6, 2, 5, 5, 9, 1, 5,
     4, 8, 5, 8, 8,12, 1, 4, 4, 8,
     4, 7, 7,11, 4, 8, 7,11, 8,11,
    11,15, 1, 4, 3, 7, 4, 7, 7,11,
     3, 7, 6,10, 7,10,10,14, 4, 7,
     7,11, 7,10,10,14, 7,11,10,14,
    11,14,14,18, 0, 4, 3, 7, 3, 6,
     6,10, 3, 7, 6,10, 7,10,10,14,
     3, 6, 6,10, 6, 9, 9,13, 6,10,
     9,13,10,13,13,17, 4, 7, 6,10,
     7,10,10,14, 6,10, 9,13,10,13,
    13,17, 7,10,10,14,10,13,13,17,
    10,14,13,17,14,17,17,21, 0, 3,
     3, 7, 3, 6, 6,10, 2, 6, 5, 9,
     6, 9, 9,13, 3, 6, 6,10, 6, 9,
     9,13, 6,10, 9,13,10,13,13,17,
     3, 6, 5, 9, 6, 9, 9,13, 5, 9,
     8,12, 9,12,12,16, 6, 9, 9,13,
     9,12,12,16, 9,13,12,16,13,16,
    16,20, 3, 7, 6,10, 6, 9, 9,13,
     6,10, 9,13,10,13,13,17, 6, 9,
     9,13, 9,12,12,16, 9,13,12,16,
    13,16,16,20, 7,10, 9,13,10,13,
    13,17, 9,13,12,16,13,16,16,20,
    10,13,13,17,13,16,16,20,13,17,
    16,20,17,20,20,24};
251
WelsCalculateSingleCtr4x4_mmi(int16_t * pDct)252 int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) {
253 int32_t iSingleCtr = 0;
254 __asm__ volatile(
255 ".set arch=loongson3a \n\t"
256 "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
257 "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
258 "packsshb $f0, $f0, $f2 \n\t"
259 "packsshb $f2, $f4, $f6 \n\t"
260
261 "xor $f10, $f10, $f10 \n\t"
262 "xor $f8, $f8, $f8 \n\t"
263
264 "pcmpeqb $f0, $f0, $f8 \n\t"
265 "pcmpeqb $f2, $f2, $f8 \n\t"
266
267 "pmovmskb $f10, $f0 \n\t"
268 "pmovmskb $f12, $f2 \n\t"
269 "punpcklbh $f10, $f10, $f12 \n\t"
270
271 "dmfc1 $12, $f10 \n\t"
272 "dli $8, 0xffff \n\t"
273 "xor $12, $12, $8 \n\t"
274
275 "xor %[pDct], %[pDct], %[pDct] \n\t"
276 "dli $8, 0x80 \n\t"
277 "dli $9, 0x7 \n\t"
278 "dli $10, 0x100 \n\t"
279 "dli $11, 0x8 \n\t"
280
281 "1: \n\t"
282 "and $13, $12, $8 \n\t"
283 "bnez $13, 2f \n\t"
284 "nop \n\t"
285 "daddiu $9, -0x1 \n\t"
286 "dsrl $8, 1 \n\t"
287 "bnez $9, 1b \n\t"
288 "nop \n\t"
289 "2: \n\t"
290 "and $13, $12, $10 \n\t"
291 "bnez $13, 3f \n\t"
292 "nop \n\t"
293 "daddiu $11, 0x1 \n\t"
294 "dsll $10, 1 \n\t"
295 "daddiu $13, $11, -0x10 \n\t"
296 "bltz $13, 2b \n\t"
297 "nop \n\t"
298 "3: \n\t"
299 "dsubu $11, $11, $9 \n\t"
300 "daddiu $11, -0x1 \n\t"
301 PTR_ADDU "$8, %[i_ds_table], $11 \n\t"
302 "lb $10, 0x0($8) \n\t"
303 PTR_ADDU "%[pDct], %[pDct], $10 \n\t"
304 "move $11, $12 \n\t"
305 "dli $10, 0xff \n\t"
306 "and $12, $10 \n\t"
307 "dsrl $11, 0x8 \n\t"
308 "and $11, $10 \n\t"
309 PTR_ADDU "$8, %[low_mask_table], $12 \n\t"
310 "lb $10, 0x0($8) \n\t"
311 PTR_ADDU "%[pDct], %[pDct], $10 \n\t"
312 PTR_ADDU "$8, %[high_mask_table], $11 \n\t"
313 "lb $10, 0x0($8) \n\t"
314 PTR_ADDU "%[iSingleCtr], %[pDct], $10 \n\t"
315 : [iSingleCtr] "=r"(iSingleCtr)
316 : [pDct] "r"((short *)pDct),
317 [i_ds_table] "r"((unsigned char *)i_ds_table),
318 [high_mask_table] "r"((unsigned char *)high_mask_table),
319 [low_mask_table] "r"((unsigned char *)low_mask_table)
320 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
321 "$f6", "$f8", "$f10", "$f12"
322 );
323 return iSingleCtr;
324 }
325