• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    score_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    21/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
/*!
 * \brief 256-entry lookup table: entry i holds the number of bits set in i.
 *
 * WelsGetNoneZeroCount_mmi indexes it with an 8-bit mask that has one bit
 * per non-zero coefficient, so a single lookup yields the count for eight
 * coefficients.
 */
unsigned char nozero_count_table[] __attribute__((aligned(16))) = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
55 
WelsGetNoneZeroCount_mmi(int16_t * level)56 int32_t WelsGetNoneZeroCount_mmi(int16_t *level) {
57   int ret_val = 0;
58   __asm__ volatile(
59     ".set       arch=loongson3a                 \n\t"
60     "gslqc1     $f2, $f0, 0x0(%[level])         \n\t"
61     "gslqc1     $f6, $f4, 0x10(%[level])        \n\t"
62     "xor        $f8, $f8, $f8                   \n\t"
63     "pcmpeqh    $f0, $f0, $f8                   \n\t"
64     "pcmpeqh    $f2, $f2, $f8                   \n\t"
65     "pcmpeqh    $f4, $f4, $f8                   \n\t"
66     "pcmpeqh    $f6, $f6, $f8                   \n\t"
67     "packsshb   $f4, $f4, $f6                   \n\t"
68     "packsshb   $f6, $f0, $f2                   \n\t"
69     "pmovmskb   $f0, $f4                        \n\t"
70     "pmovmskb   $f2, $f6                        \n\t"
71     "dmfc1      $8, $f0                         \n\t"
72     "dmfc1      $9, $f2                         \n\t"
73     "xor        $8, 0xFF                        \n\t"
74     "xor        $9, 0xFF                        \n\t"
75     PTR_ADDU   "$10, $8, %[nozero_count_table]  \n\t"
76     "lbu        $8, 0x0($10)                    \n\t"
77     PTR_ADDU   "$10, $9, %[nozero_count_table]  \n\t"
78     "lbu        $9, 0x0($10)                    \n\t"
79     PTR_ADDU   "%[ret_val], $8, $9              \n\t"
80     : [ret_val] "=r"((int)ret_val)
81     : [level] "r"((unsigned char *)level),
82       [nozero_count_table] "r"((unsigned char *)nozero_count_table)
83     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"
84   );
85   return ret_val;
86 }
87 
WelsScan4x4DcAc_mmi(int16_t level[16],int16_t * pDct)88 void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) {
89   BACKUP_REG;
90   __asm__ volatile(
91     ".set       arch=loongson3a                 \n\t"
92     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
93     "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t"
94     "dli        $8, 0x3                         \n\t"
95     "dmtc1      $8, $f22                        \n\t"
96     "dli        $8, 0x2                         \n\t"
97     "dmtc1      $8, $f24                        \n\t"
98     "dli        $8, 0x1                         \n\t"
99     "dmtc1      $8, $f26                        \n\t"
100     "dmtc1      $0, $f28                        \n\t"
101     "pextrh     $f18, $f2, $f22                 \n\t"
102     "pextrh     $f20, $f4, $f24                 \n\t"
103     "pextrh     $f16, $f2, $f26                 \n\t"
104     "pinsrh_2   $f4, $f4, $f18                  \n\t"
105     "pinsrh_3   $f2, $f2, $f16                  \n\t"
106     "pextrh     $f18, $f4, $f28                 \n\t"
107     "pinsrh_1   $f2, $f2, $f18                  \n\t"
108     "pinsrh_0   $f4, $f4, $f20                  \n\t"
109     "dli        $8, 0x93                        \n\t"
110     "dmtc1      $8, $f22                        \n\t"
111     "dli        $8, 0x39                        \n\t"
112     "dmtc1      $8, $f24                        \n\t"
113     "punpckhwd  $f10, $f0, $f2                  \n\t"
114     "punpcklwd  $f8, $f0, $f2                   \n\t"
115     "punpckhwd  $f14, $f4, $f6                  \n\t"
116     "punpcklwd  $f12, $f4, $f6                  \n\t"
117     "mov.d      $f0, $f8                        \n\t"
118     "pshufh     $f2, $f10, $f22                 \n\t"
119     "pshufh     $f4, $f12, $f24                 \n\t"
120     "mov.d      $f6, $f14                       \n\t"
121     "gssqc1     $f2, $f0, 0x0(%[level])         \n\t"
122     "gssqc1     $f6, $f4, 0x10(%[level])        \n\t"
123     :
124     : [level] "r"((short *)level), [pDct] "r"((short *)pDct)
125     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
126       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
127   );
128   RECOVER_REG;
129 }
130 
WelsScan4x4Ac_mmi(int16_t * zig_value,int16_t * pDct)131 void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) {
132   BACKUP_REG;
133   __asm__ volatile(
134     ".set       arch=loongson3a                 \n\t"
135     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
136     "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t"
137     "mov.d      $f8, $f2                        \n\t"
138     "mov.d      $f2, $f4                        \n\t"
139     "mov.d      $f10, $f6                       \n\t"
140 
141     "mov.d      $f12, $f2                       \n\t"
142     "punpckhwd  $f2, $f0, $f8                   \n\t"
143     "punpcklwd  $f0, $f0, $f8                   \n\t"
144     "punpckhwd  $f14, $f12, $f10                \n\t"
145     "punpcklwd  $f12, $f12, $f10                \n\t"
146 
147     "dmtc1      $0, $f20                        \n\t"
148     "dli        $8, 0x10                        \n\t"
149     "dmtc1      $8, $f22                        \n\t"
150     "dli        $8, 0x30                        \n\t"
151     "dmtc1      $8, $f24                        \n\t"
152     "dli        $8, 0x3                         \n\t"
153     "dmtc1      $8, $f26                        \n\t"
154     "dli        $8, 0x93                        \n\t"
155     "dmtc1      $8, $f28                        \n\t"
156     "dli        $8, 0x39                        \n\t"
157     "dmtc1      $8, $f30                        \n\t"
158     "pextrh     $f16, $f0, $f26                 \n\t"
159     "pextrh     $f18, $f2, $f26                 \n\t"
160     "pinsrh_3   $f2, $f2, $f16                  \n\t"
161     "pextrh     $f16, $f14, $f20                \n\t"
162     "pinsrh_0   $f14, $f14, $f18                \n\t"
163     "pextrh     $f18, $f12, $f20                \n\t"
164     "pinsrh_0   $f12, $f12, $f16                \n\t"
165     "pinsrh_3   $f0, $f0, $f18                  \n\t"
166 
167     "mov.d      $f4, $f0                        \n\t"
168     "pshufh     $f6, $f2, $f28                  \n\t"
169     "pshufh     $f8, $f12, $f30                 \n\t"
170     "mov.d      $f10, $f14                      \n\t"
171 
172     "mov.d      $f12, $f8                       \n\t"
173     "mov.d      $f14, $f10                      \n\t"
174     "dsrl       $f4, $f4, $f22                  \n\t"
175     "pinsrh_3   $f4, $f4, $f6                   \n\t"
176     "dsrl       $f6, $f6, $f22                  \n\t"
177     "dsll       $f14, $f12, $f24                \n\t"
178     "xor        $f12, $f12, $f12                \n\t"
179     "or         $f4, $f4, $f12                  \n\t"
180     "or         $f6, $f6, $f14                  \n\t"
181     "dsrl       $f8, $f8, $f22                  \n\t"
182     "pinsrh_3   $f8, $f8, $f10                  \n\t"
183     "dsrl       $f10, $f10, $f22                \n\t"
184     "gssqc1     $f6, $f4, 0x0(%[zig_value])     \n\t"
185     "gssqc1     $f10, $f8, 0x10(%[zig_value])   \n\t"
186     :
187     : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)
188     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
189       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
190   );
191   RECOVER_REG;
192 }
193 
/*!
 * \brief 16-entry score table read by WelsCalculateSingleCtr4x4_mmi.
 *
 * The index (0..15) is computed there from two bit positions found in the
 * 16-bit non-zero-coefficient mask: one scanning the upper byte upwards from
 * bit 8, one scanning the lower byte downwards from bit 7.
 */
unsigned char i_ds_table[]__attribute__((aligned(16))) = {
      3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/*!
 * \brief 256-entry score table indexed by the upper byte (bits 8..15) of the
 *        16-bit non-zero-coefficient mask built in
 *        WelsCalculateSingleCtr4x4_mmi.
 */
unsigned char high_mask_table[]__attribute__((aligned(16))) = {
      0, 0, 0, 3, 0, 2, 3, 6, 0, 2,
      2, 5, 3, 5, 6, 9, 0, 1, 2, 5,
      2, 4, 5, 8, 3, 5, 5, 8, 6, 8,
      9,12, 0, 1, 1, 4, 2, 4, 5, 8,
      2, 4, 4, 7, 5, 7, 8,11, 3, 4,
      5, 8, 5, 7, 8,11, 6, 8, 8,11,
      9,11,12,15, 0, 1, 1, 4, 1, 3,
      4, 7, 2, 4, 4, 7, 5, 7, 8,11,
      2, 3, 4, 7, 4, 6, 7,10, 5, 7,
      7,10, 8,10,11,14, 3, 4, 4, 7,
      5, 7, 8,11, 5, 7, 7,10, 8,10,
     11,14, 6, 7, 8,11, 8,10,11,14,
      9,11,11,14,12,14,15,18, 0, 0,
      1, 4, 1, 3, 4, 7, 1, 3, 3, 6,
      4, 6, 7,10, 2, 3, 4, 7, 4, 6,
      7,10, 5, 7, 7,10, 8,10,11,14,
      2, 3, 3, 6, 4, 6, 7,10, 4, 6,
      6, 9, 7, 9,10,13, 5, 6, 7,10,
      7, 9,10,13, 8,10,10,13,11,13,
     14,17, 3, 4, 4, 7, 4, 6, 7,10,
      5, 7, 7,10, 8,10,11,14, 5, 6,
      7,10, 7, 9,10,13, 8,10,10,13,
     11,13,14,17, 6, 7, 7,10, 8,10,
     11,14, 8,10,10,13,11,13,14,17,
      9,10,11,14,11,13,14,17,12,14,
     14,17,15,17,18,21};
223 
/*!
 * \brief 256-entry score table indexed by the lower byte (bits 0..7) of the
 *        16-bit non-zero-coefficient mask built in
 *        WelsCalculateSingleCtr4x4_mmi.
 */
unsigned char low_mask_table[]__attribute__((aligned(16))) = {
      0, 3, 2, 6, 2, 5, 5, 9, 1, 5,
      4, 8, 5, 8, 8,12, 1, 4, 4, 8,
      4, 7, 7,11, 4, 8, 7,11, 8,11,
     11,15, 1, 4, 3, 7, 4, 7, 7,11,
      3, 7, 6,10, 7,10,10,14, 4, 7,
      7,11, 7,10,10,14, 7,11,10,14,
     11,14,14,18, 0, 4, 3, 7, 3, 6,
      6,10, 3, 7, 6,10, 7,10,10,14,
      3, 6, 6,10, 6, 9, 9,13, 6,10,
      9,13,10,13,13,17, 4, 7, 6,10,
      7,10,10,14, 6,10, 9,13,10,13,
     13,17, 7,10,10,14,10,13,13,17,
     10,14,13,17,14,17,17,21, 0, 3,
      3, 7, 3, 6, 6,10, 2, 6, 5, 9,
      6, 9, 9,13, 3, 6, 6,10, 6, 9,
      9,13, 6,10, 9,13,10,13,13,17,
      3, 6, 5, 9, 6, 9, 9,13, 5, 9,
      8,12, 9,12,12,16, 6, 9, 9,13,
      9,12,12,16, 9,13,12,16,13,16,
     16,20, 3, 7, 6,10, 6, 9, 9,13,
      6,10, 9,13,10,13,13,17, 6, 9,
      9,13, 9,12,12,16, 9,13,12,16,
     13,16,16,20, 7,10, 9,13,10,13,
     13,17, 9,13,12,16,13,16,16,20,
     10,13,13,17,13,16,16,20,13,17,
     16,20,17,20,20,24};
251 
WelsCalculateSingleCtr4x4_mmi(int16_t * pDct)252 int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) {
253   int32_t iSingleCtr = 0;
254   __asm__ volatile(
255     ".set       arch=loongson3a                 \n\t"
256     "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
257     "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t"
258     "packsshb   $f0, $f0, $f2                   \n\t"
259     "packsshb   $f2, $f4, $f6                   \n\t"
260 
261     "xor        $f10, $f10, $f10                \n\t"
262     "xor        $f8, $f8, $f8                   \n\t"
263 
264     "pcmpeqb    $f0, $f0, $f8                   \n\t"
265     "pcmpeqb    $f2, $f2, $f8                   \n\t"
266 
267     "pmovmskb   $f10, $f0                       \n\t"
268     "pmovmskb   $f12, $f2                       \n\t"
269     "punpcklbh  $f10, $f10, $f12                \n\t"
270 
271     "dmfc1      $12, $f10                       \n\t"
272     "dli        $8, 0xffff                      \n\t"
273     "xor        $12, $12, $8                    \n\t"
274 
275     "xor        %[pDct], %[pDct], %[pDct]       \n\t"
276     "dli        $8, 0x80                        \n\t"
277     "dli        $9, 0x7                         \n\t"
278     "dli        $10, 0x100                      \n\t"
279     "dli        $11, 0x8                        \n\t"
280 
281     "1:                                         \n\t"
282     "and        $13, $12, $8                    \n\t"
283     "bnez       $13, 2f                         \n\t"
284     "nop                                        \n\t"
285     "daddiu     $9, -0x1                        \n\t"
286     "dsrl       $8, 1                           \n\t"
287     "bnez       $9, 1b                          \n\t"
288     "nop                                        \n\t"
289     "2:                                         \n\t"
290     "and        $13, $12, $10                   \n\t"
291     "bnez       $13, 3f                         \n\t"
292     "nop                                        \n\t"
293     "daddiu     $11, 0x1                        \n\t"
294     "dsll       $10, 1                          \n\t"
295     "daddiu     $13, $11, -0x10                 \n\t"
296     "bltz       $13, 2b                         \n\t"
297     "nop                                        \n\t"
298     "3:                                         \n\t"
299     "dsubu      $11, $11, $9                    \n\t"
300     "daddiu     $11, -0x1                       \n\t"
301     PTR_ADDU   "$8, %[i_ds_table], $11          \n\t"
302     "lb         $10, 0x0($8)                    \n\t"
303     PTR_ADDU   "%[pDct], %[pDct], $10           \n\t"
304     "move       $11, $12                        \n\t"
305     "dli        $10, 0xff                       \n\t"
306     "and        $12, $10                        \n\t"
307     "dsrl       $11, 0x8                        \n\t"
308     "and        $11, $10                        \n\t"
309     PTR_ADDU   "$8, %[low_mask_table], $12      \n\t"
310     "lb         $10, 0x0($8)                    \n\t"
311     PTR_ADDU   "%[pDct], %[pDct], $10           \n\t"
312     PTR_ADDU   "$8, %[high_mask_table], $11     \n\t"
313     "lb         $10, 0x0($8)                    \n\t"
314     PTR_ADDU   "%[iSingleCtr], %[pDct], $10     \n\t"
315     : [iSingleCtr] "=r"(iSingleCtr)
316     : [pDct] "r"((short *)pDct),
317       [i_ds_table] "r"((unsigned char *)i_ds_table),
318       [high_mask_table] "r"((unsigned char *)high_mask_table),
319       [low_mask_table] "r"((unsigned char *)low_mask_table)
320     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
321       "$f6", "$f8", "$f10", "$f12"
322   );
323   return iSingleCtr;
324 }
325