/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    dct_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    20/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

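// Load a 4x8 block of 16-bit coefficients (64 bytes) at r0 and rearrange its
// doublewords with MMI_XSawp_DQ for the transform passes below.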
#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  "gslqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
  "gslqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \
  "gslqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \
  "gslqc1     "#f6", "#f4", 0x30("#r0")       \n\t" \
  MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14)           \
  MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)

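// IDCT half-butterfly: f0/f2 = a + (b >> shift), f12/f14 = (a >> shift) - b;
// the shift amount (1) is passed in f16.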
#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) \
  "mov.d      "#f8", "#f4"                    \n\t" \
  "mov.d      "#f10", "#f6"                   \n\t" \
  "psrah      "#f4", "#f4", "#f16"            \n\t" \
  "psrah      "#f6", "#f6", "#f16"            \n\t" \
  "psrah      "#f12", "#f0", "#f16"           \n\t" \
  "psrah      "#f14", "#f2", "#f16"           \n\t" \
  "paddh      "#f0", "#f0", "#f4"             \n\t" \
  "paddh      "#f2", "#f2", "#f6"             \n\t" \
  "psubh      "#f12", "#f12", "#f8"           \n\t" \
  "psubh      "#f14", "#f14", "#f10"          \n\t"

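// One pass of the 4x4 inverse integer transform: MMI_SumSub and MMI_SumSubDiv2
// butterflies over the four rows held in the register pairs.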
#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) \
  MMI_SumSub(f24, f26, f4, f6, f20, f22)                        \
  MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28)      \
  MMI_SumSub(f4, f6, f0, f2, f16, f18)                          \
  MMI_SumSub(f24, f26, f12, f14, f16, f18)

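// Add the rounding constant in f8, shift right by f14 (6), add the residual to
// 8 prediction bytes at r1 and store the result, clamped to [0, 255], at r0.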
#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) \
  "paddh      "#f0", "#f0", "#f8"             \n\t" \
  "paddh      "#f2", "#f2", "#f8"             \n\t" \
  "psrah      "#f0", "#f0", "#f14"            \n\t" \
  "psrah      "#f2", "#f2", "#f14"            \n\t" \
  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
  "punpckhbh  "#f6", "#f4", "#f12"            \n\t" \
  "punpcklbh  "#f4", "#f4", "#f12"            \n\t" \
  "paddsh     "#f4", "#f4", "#f0"             \n\t" \
  "paddsh     "#f6", "#f6", "#f2"             \n\t" \
  "packushb   "#f4", "#f4", "#f6"             \n\t" \
  "gssdlc1    "#f4", 0x7("#r0")               \n\t" \
  "gssdrc1    "#f4", 0x0("#r0")               \n\t"

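// Like MMI_StoreDiff8p_6 but without the round/shift step: the values in f0/f2
// are already scaled (used on the DC-only path).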
#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) \
  "gsldlc1    "#f4", "#offset"+0x7("#r1")     \n\t" \
  "gsldrc1    "#f4", "#offset"+0x0("#r1")     \n\t" \
  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
  "paddsh     "#f4", "#f4", "#f0"             \n\t" \
  "paddsh     "#f6", "#f6", "#f2"             \n\t" \
  "packushb   "#f4", "#f4", "#f6"             \n\t" \
  "gssdlc1    "#f4", "#offset"+0x7("#r0")     \n\t" \
  "gssdrc1    "#f4", "#offset"+0x0("#r0")     \n\t"

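// Load 8 DC values, apply the rounding constant (f16) and right shift (f20),
// then broadcast each DC into four halfwords of f0..f14 so that one register
// covers one 4x4 sub-block.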
#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) \
  "gslqc1     "#f2", "#f0", "#offset"+0x0("#r0") \n\t" \
  "paddh      "#f0", "#f0", "#f16"               \n\t" \
  "paddh      "#f2", "#f2", "#f16"               \n\t" \
  "psrah      "#f0", "#f0", "#f20"               \n\t" \
  "psrah      "#f2", "#f2", "#f20"               \n\t" \
  "punpckhhw  "#f4", "#f0", "#f0"                \n\t" \
  "punpckhwd  "#f6", "#f4", "#f4"                \n\t" \
  "punpcklwd  "#f4", "#f4", "#f4"                \n\t" \
  "punpcklhw  "#f8", "#f2", "#f2"                \n\t" \
  "punpckhwd  "#f10", "#f8", "#f8"               \n\t" \
  "punpcklwd  "#f8", "#f8", "#f8"                \n\t" \
  "punpckhhw  "#f12", "#f2", "#f2"               \n\t" \
  "punpckhwd  "#f14", "#f12", "#f12"             \n\t" \
  "punpcklwd  "#f12", "#f12", "#f12"             \n\t" \
  "punpcklhw  "#f0", "#f0", "#f0"                \n\t" \
  "punpckhwd  "#f2", "#f0", "#f0"                \n\t" \
  "punpcklwd  "#f0", "#f0", "#f0"                \n\t"

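// Reconstruct two rows of 16 pixels: two 8-pixel stores per row, advancing the
// destination (r0/r2) and prediction (r1/r3) pointers between rows.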
#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) \
  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \
  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)         \
  PTR_ADDU   ""#r0", "#r0", "#r2"                        \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r3"                        \n\t" \
  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \
  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)

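// Gather four 16-bit coefficients (at offset, offset+0x20, offset+0x80 and
// offset+0xa0) into the 32-bit lanes of f0/f2; $8 is used as a scratch GPR.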
#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) \
  "lh         $8, "#offset"("#r0")        \n\t" \
  "dmtc1      $8, "#f0"                   \n\t" \
  "lh         $8, "#offset"+0x20("#r0")   \n\t" \
  "dmtc1      $8, "#f4"                   \n\t" \
  "punpcklwd  "#f0", "#f0", "#f4"         \n\t" \
  "lh         $8, "#offset"+0x80("#r0")   \n\t" \
  "dmtc1      $8, "#f6"                   \n\t" \
  "lh         $8, "#offset"+0xa0("#r0")   \n\t" \
  "dmtc1      $8, "#f8"                   \n\t" \
  "punpcklwd  "#f2", "#f6", "#f8"         \n\t"

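// Word-wise butterfly: the second operand pair becomes a + b, the first
// becomes a - b (f8/f10 are scratch).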
#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) \
  "mov.d      "#f8", "#f4"                \n\t" \
  "mov.d      "#f10", "#f6"               \n\t" \
  "paddw      "#f4", "#f4", "#f0"         \n\t" \
  "paddw      "#f6", "#f6", "#f2"         \n\t" \
  "psubw      "#f0", "#f0", "#f8"         \n\t" \
  "psubw      "#f2", "#f2", "#f10"        \n\t"

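// Build the constant 1 in every 32-bit lane (all-ones logically shifted right
// by 31); used as the rounding term in MMI_SumSubDiv2D.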
#define WELS_DD1(f0, f2, f_val_31) \
  "pcmpeqh    "#f0", "#f0", "#f0"         \n\t" \
  "pcmpeqh    "#f2", "#f2", "#f2"         \n\t" \
  "psrlw      "#f0", "#f0", "#f_val_31"   \n\t" \
  "psrlw      "#f2", "#f2", "#f_val_31"   \n\t"

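// On 32-bit lanes: f0/f2 = (a + b + 1) >> 1 and f12/f14 = ((a + b + 1) >> 1) - b;
// f8/f10 hold the rounding 1 produced by WELS_DD1.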
#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) \
  "paddw      "#f0", "#f0", "#f4"         \n\t" \
  "paddw      "#f2", "#f2", "#f6"         \n\t" \
  "paddw      "#f0", "#f0", "#f8"         \n\t" \
  "paddw      "#f2", "#f2", "#f10"        \n\t" \
  "psraw      "#f0", "#f0", "#f_val_1"    \n\t" \
  "psraw      "#f2", "#f2", "#f_val_1"    \n\t" \
  "mov.d      "#f12", "#f0"               \n\t" \
  "mov.d      "#f14", "#f2"               \n\t" \
  "psubw      "#f12", "#f12", "#f4"       \n\t" \
  "psubw      "#f14", "#f14", "#f6"       \n\t"

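// Transpose a 4x4 matrix of 32-bit values held in four register pairs, using
// the word/doubleword swap helpers from asmdefs_mmi.h.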
#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  MMI_XSawp_WD(f0, f2, f4, f6, f16, f18)  \
  MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \
  MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) \
  MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10)

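// Forward-transform butterfly: f0/f2 = 2*a + b, f8/f10 = a - 2*b.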
#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) \
  "mov.d      "#f8", "#f0"                    \n\t" \
  "mov.d      "#f10", "#f2"                   \n\t" \
  "paddh      "#f0", "#f0", "#f0"             \n\t" \
  "paddh      "#f2", "#f2", "#f2"             \n\t" \
  "paddh      "#f0", "#f0", "#f4"             \n\t" \
  "paddh      "#f2", "#f2", "#f6"             \n\t" \
  "psubh      "#f8", "#f8", "#f4"             \n\t" \
  "psubh      "#f10", "#f10", "#f6"           \n\t" \
  "psubh      "#f8", "#f8", "#f4"             \n\t" \
  "psubh      "#f10", "#f10", "#f6"           \n\t"

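// One pass of the 4x4 forward integer transform, processing two 4x4 blocks at
// a time (MMI_SumSub butterflies plus the x2 butterfly above).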
#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) \
  MMI_SumSub(f20, f22, f8, f10, f16, f18)   \
  MMI_SumSub(f0, f2, f4, f6, f16, f18)      \
  MMI_SumSub(f8, f10, f4, f6, f16, f18)     \
  MMI_SumSubMul2(f20, f22, f0, f2, f12, f14)

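// Counterpart of MMI_Load4x8p: re-interleave the doublewords and store the
// 4x8 block of 16-bit coefficients (64 bytes) at r0.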
#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
  MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18)            \
  MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6)           \
  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
  "gssqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \
  "gssqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \
  "gssqc1     "#f6", "#f4", 0x30("#r0")       \n\t"

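// Load 4 pixels from r0 and r1, widen them to 16 bits (f4 must hold zero) and
// leave the difference row in f0.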
#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) \
  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
  "gsldlc1    "#f2", 0x7("#r1")               \n\t" \
  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
  "gsldrc1    "#f2", 0x0("#r1")               \n\t" \
  "punpcklbh  "#f0", "#f0", "#f4"             \n\t" \
  "punpcklbh  "#f2", "#f2", "#f4"             \n\t" \
  "psubh      "#f0", "#f0", "#f2"             \n\t"

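// Load a full 4x4 residual block: four MMI_LoadDiff4P_SINGLE rows, stepping r0
// by r1 and r2 by r3 between rows.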
#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) \
  MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10)        \
  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
  MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10)        \
  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
  MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10)        \
  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
  MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10)

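// 4x4 forward transform on a single block held in f0/f2/f4/f6, using the
// _SINGLE sum/sub helpers from asmdefs_mmi.h.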
#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
  MMI_SumSub_SINGLE(f6, f0, f10)     \
  MMI_SumSub_SINGLE(f4, f2, f10)     \
  MMI_SumSub_SINGLE(f4, f6, f10)     \
  MMI_SumSubMul2_SINGLE(f0, f2, f8, f12)

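// 4x4 inverse integer transform of pDct plus reconstruction: the result is
// added to the prediction (pPred/iPredStride) and written, clamped to
// [0, 255], into pRec/iStride.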
void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                       int32_t iPredStride, int16_t* pDct) {
  __asm__ volatile (
    ".set       arch=loongson3a                    \n\t"
    "gsldlc1    $f0, 0x7(%[pDct])                  \n\t"
    "gsldrc1    $f0, 0x0(%[pDct])                  \n\t"
    "gsldlc1    $f2, 0xF(%[pDct])                  \n\t"
    "gsldrc1    $f2, 0x8(%[pDct])                  \n\t"
    "gsldlc1    $f4, 0x17(%[pDct])                 \n\t"
    "gsldrc1    $f4, 0x10(%[pDct])                 \n\t"
    "gsldlc1    $f6, 0x1F(%[pDct])                 \n\t"
    "gsldrc1    $f6, 0x18(%[pDct])                 \n\t"

    "dli        $8, 0x1                            \n\t"
    "dmtc1      $8, $f16                           \n\t"
    "dli        $8, 0x6                            \n\t"
    "dmtc1      $8, $f18                           \n\t"

    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16)

    "xor        $f14, $f14, $f14                   \n\t"
    "dli        $8, 0x0020                         \n\t"
    "dmtc1      $8, $f12                           \n\t"
    "punpcklhw  $f12, $f12, $f12                   \n\t"
    "punpcklwd  $f12, $f12, $f12                   \n\t"

    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),
      [pDct]"r"((short *)pDct)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18"
  );
}

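// Four 4x4 inverse transforms with reconstruction over an 8x8 area: the first
// 4x8 half is processed, then pDct is advanced by 0x40 for the second half.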
void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                           int32_t iPredStride, int16_t* pDct) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                    \n\t"
    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
    "dli        $8, 0x1                            \n\t"
    "dmtc1      $8, $f30                           \n\t"
    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
             $f0, $f2, $f30)
    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
             $f4, $f6, $f30)

    "xor        $f28, $f28, $f28                   \n\t"
    "dli        $8, 0x6                            \n\t"
    "dmtc1      $8, $f26                           \n\t"
    "dli        $8, 0x0020                         \n\t"
    "dmtc1      $8, $f24                           \n\t"
    "punpcklhw  $f24, $f24, $f24                   \n\t"
    "punpcklwd  $f24, $f24, $f24                   \n\t"

    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

    PTR_ADDIU  "%[pDct], %[pDct], 0x40             \n\t"
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
             $f0, $f2, $f30)
    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
             $f4, $f6, $f30)

    "dli        $8, 0x6                            \n\t"
    "dmtc1      $8, $f26                           \n\t"
    "dli        $8, 0x0020                         \n\t"
    "dmtc1      $8, $f24                           \n\t"
    "punpcklhw  $f24, $f24, $f24                   \n\t"
    "punpcklwd  $f24, $f24, $f24                   \n\t"

    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
      [pDct]"+&r"((short *)pDct)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

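// DC-only reconstruction of a 16x16 luma block: each 4x4 sub-block adds its
// rounded (+32, >>6) DC value to the prediction.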
void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
                             int32_t iPredStride, int16_t* pDct) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                    \n\t"
    "xor        $f28, $f28, $f28                   \n\t"
    "dli        $8, 0x0020                         \n\t"
    "dmtc1      $8, $f24                           \n\t"
    "punpcklhw  $f24, $f24, $f24                   \n\t"
    "punpcklwd  $f24, $f24, $f24                   \n\t"
    "dli        $8, 0x6                            \n\t"
    "dmtc1      $8, $f30                           \n\t"

    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24,
                %[pDct], 0x0, $f30)

    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30)
    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])

    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
                      %[pPred], %[iStride], %[iPredStride])
    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
      [pDct]"+&r"((short *)pDct)
    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

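// 4x4 Hadamard transform of the 16 luma DC coefficients gathered from pDct
// (one per 4x4 block); the transformed DCs are written to luma_dc.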
WelsHadamardT4Dc_mmi(int16_t * luma_dc,int16_t * pDct)402 void WelsHadamardT4Dc_mmi( int16_t *luma_dc, int16_t *pDct) {
403   BACKUP_REG;
404   __asm__ volatile (
405     ".set       arch=loongson3a                 \n\t"
406     MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0)
407     MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40)
408     MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100)
409     MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140)
410 
411     MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30)
412     MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)
413     MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30)
414     MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)
415 
416     MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22)
417 
418     MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30)
419     MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)
420 
421     "dli        $8, 0x1F                        \n\t"
422     "dmtc1      $8, $f30                        \n\t"
423 
424     WELS_DD1($f24, $f26, $f30)
425 
426     "dli        $8, 0x1                         \n\t"
427     "dmtc1      $8, $f30                        \n\t"
428 
429     MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30)
430     MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)
431     MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)
432 
433     "packsswh   $f12, $f12, $f14                \n\t"
434     "packsswh   $f14, $f16, $f18                \n\t"
435 
436     "packsswh   $f8, $f8, $f10                  \n\t"
437     "packsswh   $f10, $f4, $f6                  \n\t"
438     "gssqc1     $f14, $f12, 0x0(%[luma_dc])     \n\t"
439     "gssqc1     $f10, $f8, 0x10(%[luma_dc])     \n\t"
440    :
441    : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct)
442    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
443      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
444   );
445   RECOVER_REG;
446 }
447 
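// Forward 4x4 integer transform of the residual between pix1 (stride i_pix1)
// and pix2 (stride i_pix2); the coefficients are stored to pDct.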
void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
                   uint8_t *pix2, int32_t i_pix2 ) {
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    "xor        $f14, $f14, $f14                \n\t"
    "dli        $8, 0x1                         \n\t"
    "dmtc1      $8, $f16                        \n\t"

    MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1],
                            %[pix2], %[i_pix2], $f0, $f14)

    MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4)

    MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16)
    MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)

    "gssdlc1    $f4, 0x7(%[pDct])               \n\t"
    "gssdlc1    $f2, 0xF(%[pDct])               \n\t"
    "gssdlc1    $f10, 0x17(%[pDct])             \n\t"
    "gssdlc1    $f8, 0x1F(%[pDct])              \n\t"
    "gssdrc1    $f4, 0x0(%[pDct])               \n\t"
    "gssdrc1    $f2, 0x8(%[pDct])               \n\t"
    "gssdrc1    $f10, 0x10(%[pDct])             \n\t"
    "gssdrc1    $f8, 0x18(%[pDct])              \n\t"
   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
     "$f14", "$f16"
  );
}

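// Four forward 4x4 transforms over an 8x8 residual area; the second group of
// coefficients is stored at pDct + 0x40.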
void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
                       uint8_t *pix2, int32_t i_pix2 ) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    "xor        $f28, $f28, $f28                \n\t"
    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

    PTR_ADDIU  "%[pDct], %[pDct], 0x40          \n\t"
    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
  );
  RECOVER_REG;
}