• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    dct_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    17/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
/*!
 * \brief   Asm fragment: walk %[pPred] down two rows and add the byte found
 *          at offset -1 of each row into $8.
 *
 * For each of the two rows the fragment advances %[pPred] by %[kiStride],
 * loads the unsigned byte at -1(%[pPred]) into $9 and accumulates it in $8.
 * It therefore modifies $8, $9 and %[pPred]: the enclosing asm statement has
 * to list "$8" and "$9" as clobbers and bind pPred as a read-write operand.
 * PTR_ADDU comes from asmdefs_mmi.h.
 */
#define LOAD_2_LEFT_AND_ADD                                   \
  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
  "lbu        $9, -0x1(%[pPred])                        \n\t" \
  PTR_ADDU   "$8, $8, $9                                \n\t" \
  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
  "lbu        $9, -0x1(%[pPred])                        \n\t" \
  PTR_ADDU   "$8, $8, $9                                \n\t"
50 
/*
 * Constant tables read by the MMI kernels in this file. All of them are
 * 16-byte aligned; the 16-byte tables are fetched with quad-word loads
 * (gslqc1) and the shorter ones with 8-byte loads (gsldxc1).
 * They keep external linkage and stay non-const so that existing references
 * (asm operands, possibly other translation units) continue to work.
 */

/* Sixteen bytes of 0x80: the fill value stored by the "DcNA" predictor. */
unsigned char mmi_dc_0x80[16] __attribute__((aligned(16))) = {
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

/* Eight halfwords of 2 (not referenced in the visible part of this file). */
short mmi_wd_0x02[8] __attribute__((aligned(16))) = {2, 2, 2, 2, 2, 2, 2, 2};
/* Per-column weights -7..0 and 1..8 used by the 16x16 plane predictor loop. */
short mmi_plane_inc_minus[8] __attribute__((aligned(16))) = {-7, -6, -5, -4, -3, -2, -1, 0};
short mmi_plane_inc[8] __attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
/* Descending weights 8..1 applied to the first half of the neighbour samples. */
short mmi_plane_dec[8] __attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};

/* Chroma (8x8) counterparts of the plane-prediction weights. */
short mmi_plane_inc_c[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
short mmi_plane_dec_c[4] __attribute__((aligned(16))) = {4, 3, 2, 1};
short mmi_plane_mul_b_c[8] __attribute__((aligned(16))) = {-3, -2, -1, 0, 1, 2, 3, 4};

/* Sixteen bytes of 0x01: multiplying a byte value by a word of these
 * replicates that byte into every byte lane of the word. */
unsigned char mmi_01bytes[16] __attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1,
                                                              1, 1, 1, 1, 1, 1, 1, 1};
66 
IdctResAddPred_mmi(uint8_t * pPred,const int32_t kiStride,int16_t * pRs)67 void IdctResAddPred_mmi(uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
68   __asm__ volatile (
69     ".set       arch=loongson3a                           \n\t"
70     "dli        $8, 0x1                                   \n\t"
71     "gsldxc1    $f0, 0x0(%[pRs], $0)                      \n\t"
72     "gsldxc1    $f2, 0x8(%[pRs], $0)                      \n\t"
73     "gsldxc1    $f4, 0x10(%[pRs], $0)                     \n\t"
74     "gsldxc1    $f6, 0x18(%[pRs], $0)                     \n\t"
75     "dmtc1      $8, $f14                                  \n\t"
76 
77     MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
78     MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f14)
79     MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
80     MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f14)
81 
82     "dli        $8, 0x20                                  \n\t"
83     "xor        $f14, $f14, $f14                          \n\t"
84     "dmtc1      $8, $f12                                  \n\t"
85     "pshufh     $f12, $f12, $f14                          \n\t"
86     "dli        $8, 0x6                                   \n\t"
87     "dmtc1      $8, $f16                                  \n\t"
88 
89     MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
90     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
91     MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
92     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
93     MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
94     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
95     MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
96     : [pPred]"+&r"((unsigned char *)pPred)
97     : [pRs]"r"((unsigned char *)pRs), [kiStride]"r"((int)kiStride)
98     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
99       "$f14", "$f16"
100   );
101 }
102 
WelsDecoderI16x16LumaPredDc_mmi(uint8_t * pPred,const int32_t kiStride)103 void WelsDecoderI16x16LumaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
104   __asm__ volatile(
105     ".set       arch=loongson3a                           \n\t"
106     "dli        $8, 0x5                                   \n\t"
107     "gsldxc1    $f10, 0x0(%[mmi_01bytes], $0)             \n\t"
108     "dmtc1      $8, $f8                                   \n\t"
109 
110     "move       $10, %[pPred]                             \n\t"
111     PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
112     "gslqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
113     "xor        $f4, $f4, $f4                             \n\t"
114     "pasubub    $f0, $f0, $f4                             \n\t"
115     "pasubub    $f2, $f2, $f4                             \n\t"
116     "biadd      $f0, $f0                                  \n\t"
117     "biadd      $f2, $f2                                  \n\t"
118     "paddh      $f0, $f0, $f2                             \n\t"
119 
120     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
121     "lbu        $8, -0x1(%[pPred])                        \n\t"
122     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
123     "lbu        $9, -0x1(%[pPred])                        \n\t"
124     PTR_ADDU   "$8, $8, $9                                \n\t"
125 
126     LOAD_2_LEFT_AND_ADD
127     LOAD_2_LEFT_AND_ADD
128     LOAD_2_LEFT_AND_ADD
129     LOAD_2_LEFT_AND_ADD
130     LOAD_2_LEFT_AND_ADD
131     LOAD_2_LEFT_AND_ADD
132     LOAD_2_LEFT_AND_ADD
133 
134     PTR_ADDIU  "$8, $8, 0x10                              \n\t"
135     "dmtc1      $8, $f4                                   \n\t"
136     "paddh      $f0, $f0, $f4                             \n\t"
137     "psrlw      $f0, $f0, $f8                             \n\t"
138     "pmuluw     $f0, $f0, $f10                            \n\t"
139     "punpcklwd  $f0, $f0, $f0                             \n\t"
140     "mov.d      $f2, $f0                                  \n\t"
141 
142     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
143     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
144     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
145     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
146     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
147 
148     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
149     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
150     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
151     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
152 
153     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
154     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
155     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
156     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
157 
158     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
159     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
160     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
161     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
162 
163     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
164     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
165     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
166     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
167 
168     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
169     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
170     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
171     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
172 
173     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
174     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
175     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
176     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
177 
178     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
179     "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
180     : [pPred] "+&r"((unsigned char *)pPred)
181     : [kiStride] "r"((int)kiStride),
182       [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
183     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10"
184   );
185 }
186 
WelsDecoderI16x16LumaPredPlane_mmi(uint8_t * pPred,const int32_t kiStride)187 void WelsDecoderI16x16LumaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
188   BACKUP_REG;
189   __asm__ volatile (
190     ".set       arch=loongson3a                           \n\t"
191     "move       $10, %[pPred]                             \n\t"
192     PTR_ADDIU  "%[pPred], %[pPred], -0x1                  \n\t"
193     PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
194 
195     "gsldlc1    $f0, 0x7(%[pPred])                        \n\t"
196     "xor        $f28, $f28, $f28                          \n\t"
197     "gsldrc1    $f0, 0x0(%[pPred])                        \n\t"
198     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_dec])         \n\t"
199     "punpckhbh  $f2, $f0, $f28                            \n\t"
200     "punpcklbh  $f0, $f0, $f28                            \n\t"
201     "pmullh     $f0, $f0, $f20                            \n\t"
202     "gsldlc1    $f4, 0x10(%[pPred])                       \n\t"
203     "pmullh     $f2, $f2, $f22                            \n\t"
204     "gsldrc1    $f4, 0x9(%[pPred])                        \n\t"
205     "gslqc1     $f26, $f24, 0x0(%[mmi_plane_inc])         \n\t"
206     "punpckhbh  $f6, $f4, $f28                            \n\t"
207     "punpcklbh  $f4, $f4, $f28                            \n\t"
208     "pmullh     $f4, $f4, $f24                            \n\t"
209     "pmullh     $f6, $f6, $f26                            \n\t"
210     "psubh      $f4, $f4, $f0                             \n\t"
211     "psubh      $f6, $f6, $f2                             \n\t"
212 
213     SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
214     "dmfc1      $8, $f4                                   \n\t"
215     "seh        $8, $8                                    \n\t"
216     "mul        $8, $8, 0x5                               \n\t"
217     PTR_ADDIU  "$8, $8, 0x20                              \n\t"
218     "sra        $8, $8, 0x6                               \n\t"
219     MMI_Copy8Times($f4, $f6, $f28, $8)
220 
221     "lbu        $9, 0x10(%[pPred])                        \n\t"
222     PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
223     LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
224                 %[kiStride], $11)
225 
226     PTR_ADDIU  "%[pPred], %[pPred], 0x3                   \n\t"
227     "dsll       $11, %[kiStride], 0x3                     \n\t"
228     PTR_ADDU   "$11, $11, %[pPred]                        \n\t"
229     "lbu        $8, 0x0($11)                              \n\t"
230     PTR_ADDU   "$9, $9, $8                                \n\t"
231     "dsll       $9, $9, 0x4                               \n\t"
232 
233     PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
234     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
235     LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
236                 %[kiStride], $11)
237 
238     "xor        $f16, $f16, $f16                          \n\t"
239     "punpcklbh  $f0, $f2, $f16                            \n\t"
240     "punpckhbh  $f2, $f2, $f16                            \n\t"
241     "pmullh     $f0, $f0, $f20                            \n\t"
242     "pmullh     $f2, $f2, $f22                            \n\t"
243     "punpcklbh  $f28, $f30, $f16                          \n\t"
244     "punpckhbh  $f30, $f30, $f16                          \n\t"
245     "pmullh     $f28, $f28, $f24                          \n\t"
246     "pmullh     $f30, $f30, $f26                          \n\t"
247     "psubh      $f28, $f28, $f0                           \n\t"
248     "psubh      $f30, $f30, $f2                           \n\t"
249 
250     "xor        $f8, $f8, $f8                             \n\t"
251 
252     SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
253     "dmfc1      $8, $f28                                  \n\t"
254     "seh        $8, $8                                    \n\t"
255 
256     "mul        $8, $8, 0x5                               \n\t"
257     PTR_ADDIU  "$8, $8, 0x20                              \n\t"
258     "sra        $8, $8, 0x6                               \n\t"
259     MMI_Copy8Times($f16, $f18, $f8, $8)
260 
261     "move       %[pPred], $10                             \n\t"
262     PTR_ADDIU  "$9, $9, 0x10                              \n\t"
263     "mul        $8, $8, -0x7                              \n\t"
264     PTR_ADDU   "$9, $9, $8                                \n\t"
265     MMI_Copy8Times($f0, $f2, $f8, $9)
266 
267     "xor        $8, $8, $8                                \n\t"
268     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_inc_minus])   \n\t"
269 
270     "dli        $11, 0x5                                  \n\t"
271     "dmtc1      $11, $f30                                 \n\t"
272     "1:                                                   \n\t"
273     "pmullh     $f8, $f4, $f20                            \n\t"
274     "pmullh     $f10, $f6, $f22                           \n\t"
275     "paddh      $f8, $f8, $f0                             \n\t"
276     "paddh      $f10, $f10, $f2                           \n\t"
277     "psrah      $f8, $f8, $f30                            \n\t"
278     "psrah      $f10, $f10, $f30                          \n\t"
279     "pmullh     $f12, $f4, $f24                           \n\t"
280     "pmullh     $f14, $f6, $f26                           \n\t"
281     "paddh      $f12, $f12, $f0                           \n\t"
282     "paddh      $f14, $f14, $f2                           \n\t"
283     "psrah      $f12, $f12, $f30                          \n\t"
284     "psrah      $f14, $f14, $f30                          \n\t"
285     "packushb   $f8, $f8, $f10                            \n\t"
286     "packushb   $f10, $f12, $f14                          \n\t"
287     "gssqc1     $f10, $f8, 0x0(%[pPred])                  \n\t"
288     "paddh      $f0, $f0, $f16                            \n\t"
289     "paddh      $f2, $f2, $f18                            \n\t"
290     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
291     PTR_ADDIU  "$8, $8, 0x1                               \n\t"
292     PTR_ADDIU  "$11, $8, -0x10                            \n\t"
293     "bnez       $11, 1b                                   \n\t"
294     "nop                                                  \n\t"
295     : [pPred]"+&r"((unsigned char *)pPred)
296     : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
297       [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
298     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
299       "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
300       "$f28", "$f30"
301   );
302   RECOVER_REG;
303 }
304 
/*!
 * \brief   Asm fragment: replicate the byte just left of r0 into f0/f2.
 *
 * Loads the 16 bytes at -0x10(r0) into the pair f2:f0, shifts f2 right by the
 * amount held in f4 (the caller in this file passes 56, which leaves the byte
 * from address r0 - 1), multiplies by f6 (caller passes the 0x01 bytes
 * pattern) and duplicates the low word, so f0 and f2 both end up holding
 * eight copies of that byte. The f8 parameter is accepted but not used.
 */
#define COPY_16_TIMES(r0, f0, f2, f4, f6, f8)                 \
  "gslqc1     "#f2", "#f0", -0x10("#r0")                \n\t" \
  "dsrl       "#f0", "#f2", "#f4"                       \n\t" \
  "pmuluw     "#f0", "#f0", "#f6"                       \n\t" \
  "punpcklwd  "#f0", "#f0", "#f0"                       \n\t" \
  "mov.d      "#f2", "#f0"                              \n\t"
311 
/*!
 * \brief   Asm fragment: horizontal prediction for the next two rows.
 *
 * For each of two rows: advance %[pPred] by %[kiStride], replicate the byte
 * at offset -1 of the row with COPY_16_TIMES and store the resulting 16 bytes
 * at the start of the row. Expects $f4 and $f6 to hold the shift count and
 * the multiplier set up by the caller; overwrites $f0 and $f2.
 */
#define MMI_PRED_H_16X16_TWO_LINE_DEC                         \
  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
  COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)            \
  "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t" \
  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
  COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)            \
  "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
319 
WelsDecoderI16x16LumaPredH_mmi(uint8_t * pPred,const int32_t kiStride)320 void WelsDecoderI16x16LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
321   __asm__ volatile (
322     ".set       arch=loongson3a                           \n\t"
323     "dli        $8, 56                                    \n\t"
324     "dmtc1      $8, $f4                                   \n\t"
325     "gsldxc1    $f6, 0x0(%[mmi_01bytes], $0)              \n\t"
326     "xor        $f8, $f8, $f8                             \n\t"
327 
328     COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
329     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
330     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
331     COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
332     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
333 
334     MMI_PRED_H_16X16_TWO_LINE_DEC
335     MMI_PRED_H_16X16_TWO_LINE_DEC
336     MMI_PRED_H_16X16_TWO_LINE_DEC
337     MMI_PRED_H_16X16_TWO_LINE_DEC
338     MMI_PRED_H_16X16_TWO_LINE_DEC
339     MMI_PRED_H_16X16_TWO_LINE_DEC
340     MMI_PRED_H_16X16_TWO_LINE_DEC
341     : [pPred]"+&r"((unsigned char *)pPred)
342     : [kiStride]"r"((int)kiStride),
343       [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
344     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
345   );
346 }
347 
WelsDecoderI16x16LumaPredV_mmi(uint8_t * pPred,const int32_t kiStride)348 void WelsDecoderI16x16LumaPredV_mmi(uint8_t *pPred, const int32_t kiStride) {
349   __asm__ volatile(
350     ".set       arch=loongson3a                           \n\t"
351     PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
352     "gslqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
353 
354     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
355     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
356     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
357     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
358     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
359     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
360     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
361     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
362     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
363     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
364     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
365     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
366     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
367     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
368     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
369     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
370     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
371     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
372     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
373     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
374     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
375     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
376     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
377     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
378     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
379     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
380     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
381     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
382     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
383     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
384     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
385     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
386     : [pPred] "+&r"((unsigned char *)pPred)
387     : [kiStride] "r"((int)kiStride)
388     : "memory", "$f0", "$f2"
389   );
390 }
391 
WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t * pPred,const int32_t kiStride)392 void WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
393   BACKUP_REG;
394   __asm__ volatile (
395     ".set       arch=loongson3a                           \n\t"
396     PTR_SUBU   "$8, %[pPred], %[kiStride]                 \n\t"
397     "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
398     "xor        $f28, $f28, $f28                          \n\t"
399     "pasubub    $f0, $f0, $f28                            \n\t"
400     "pasubub    $f2, $f2, $f28                            \n\t"
401     "biadd      $f0, $f0                                  \n\t"
402     "biadd      $f2, $f2                                  \n\t"
403     "paddh      $f0, $f0, $f2                             \n\t"
404     "dmfc1      $8, $f0                                   \n\t"
405 
406     PTR_ADDIU  "$8, $8, 0x8                               \n\t"
407     "dsra       $8, $8, 0x4                               \n\t"
408     MMI_Copy16Times($f4, $f6, $f28, $8)
409     "mov.d      $f0, $f4                                  \n\t"
410     "mov.d      $f2, $f6                                  \n\t"
411 
412     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
413     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
414     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
415     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
416     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
417     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
418     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
419 
420     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
421     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
422     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
423     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
424     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
425     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
426     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
427     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
428 
429     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
430     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
431     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
432     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
433     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
434     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
435     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
436     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
437 
438     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
439     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
440     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
441     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
442     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
443     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
444     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
445     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
446     : [pPred]"+&r"((unsigned char *)pPred)
447     : [kiStride]"r"((int)kiStride)
448     : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
449   );
450   RECOVER_REG;
451 }
452 
WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t * pPred,const int32_t kiStride)453 void WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t *pPred, const int32_t kiStride) {
454   __asm__ volatile(
455     ".set       arch=loongson3a                           \n\t"
456     "gslqc1     $f2, $f0, 0x0(%[mmi_dc_0x80])             \n\t"
457     "mov.d      $f4, $f0                                  \n\t"
458     "mov.d      $f6, $f2                                  \n\t"
459     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
460     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
461     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
462     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
463     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
464     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
465     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
466 
467     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
468     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
469     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
470     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
471     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
472     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
473     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
474     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
475 
476     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
477     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
478     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
479     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
480     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
481     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
482     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
483     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
484 
485     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
486     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
487     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
488     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
489     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
490     "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
491     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
492     "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
493     : [pPred] "+&r"((unsigned char *)pPred)
494     : [kiStride] "r"((int)kiStride), [mmi_dc_0x80] "r"(mmi_dc_0x80)
495     : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
496   );
497 }
498 
WelsDecoderIChromaPredPlane_mmi(uint8_t * pPred,const int32_t kiStride)499 void WelsDecoderIChromaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
500   BACKUP_REG;
501   __asm__ volatile (
502     ".set       arch=loongson3a                           \n\t"
503     "move       $10, %[pPred]                             \n\t"
504     PTR_ADDIU  "%[pPred], %[pPred], -0x1                  \n\t"
505     PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
506 
507     "gsldlc1    $f0, 0x7(%[pPred])                        \n\t"
508     "xor        $f28, $f28, $f28                          \n\t"
509     "gsldrc1    $f0, 0x0(%[pPred])                        \n\t"
510     "gsldxc1    $f20, 0x0(%[mmi_plane_dec_c], $0)         \n\t"
511     "punpcklbh  $f0, $f0, $f28                            \n\t"
512     "gsldlc1    $f4, 0xc(%[pPred])                        \n\t"
513     "pmullh     $f0, $f0, $f20                            \n\t"
514     "gsldrc1    $f4, 0x5(%[pPred])                        \n\t"
515     "gsldxc1    $f24, 0x0(%[mmi_plane_inc_c], $0)         \n\t"
516     "punpcklbh  $f4, $f4, $f28                            \n\t"
517     "pmullh     $f4, $f4, $f24                            \n\t"
518     "psubh      $f4, $f4, $f0                             \n\t"
519 
520     "xor        $f6, $f6, $f6                             \n\t"
521     "xor        $f8, $f8, $f8                             \n\t"
522     SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
523     "dmfc1      $8, $f4                                   \n\t"
524     "seh        $8, $8                                    \n\t"
525     "mul        $8, $8, 0x11                              \n\t"
526     PTR_ADDIU  "$8, $8, 0x10                              \n\t"
527     "sra        $8, $8, 0x5                               \n\t"
528     MMI_Copy8Times($f4, $f6, $f8, $8)
529 
530     "lbu        $9, 0x8(%[pPred])                         \n\t"
531     PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
532     LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
533 
534     PTR_ADDIU  "%[pPred], %[pPred], 0x3                   \n\t"
535     "dsll       $11, %[kiStride], 0x2                     \n\t"
536     PTR_ADDU   "$11, $11, %[pPred]                        \n\t"
537     "lbu        $8, 0x0($11)                              \n\t"
538     PTR_ADDU   "$9, $9, $8                                \n\t"
539     "dsll       $9, $9, 0x4                               \n\t"
540 
541     PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
542     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
543     LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
544     "xor        $f16, $f16, $f16                          \n\t"
545     "punpckhbh  $f0, $f0, $f16                            \n\t"
546     "pmullh     $f0, $f0, $f20                            \n\t"
547     "punpckhbh  $f28, $f28, $f16                          \n\t"
548     "pmullh     $f28, $f28, $f24                          \n\t"
549     "psubh      $f28, $f28, $f0                           \n\t"
550 
551     "xor        $f30, $f30, $f30                          \n\t"
552     "xor        $f8, $f8, $f8                             \n\t"
553     SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
554     "dmfc1      $8, $f28                                  \n\t"
555     "seh        $8, $8                                    \n\t"
556 
557     "mul        $8, $8, 0x11                              \n\t"
558     PTR_ADDIU  "$8, $8, 0x10                              \n\t"
559     "sra        $8, $8, 0x5                               \n\t"
560     MMI_Copy8Times($f16, $f18, $f8, $8)
561 
562     "move       %[pPred], $10                             \n\t"
563     PTR_ADDIU  "$9, $9, 0x10                              \n\t"
564     "mul        $8, $8, -0x3                              \n\t"
565     PTR_ADDU   "$9, $9, $8                                \n\t"
566     MMI_Copy8Times($f0, $f2, $f8, $9)
567 
568     "xor        $8, $8, $8                                \n\t"
569     "gslqc1     $f22, $f20, 0x0(%[mmi_plane_mul_b_c])     \n\t"
570 
571     "dli        $11, 0x5                                  \n\t"
572     "dmtc1      $11, $f30                                 \n\t"
573     "1:                                                   \n\t"
574     "pmullh     $f8, $f4, $f20                            \n\t"
575     "pmullh     $f10, $f6, $f22                           \n\t"
576     "paddh      $f8, $f8, $f0                             \n\t"
577     "paddh      $f10, $f10, $f2                           \n\t"
578     "psrah      $f8, $f8, $f30                            \n\t"
579     "psrah      $f10, $f10, $f30                          \n\t"
580     "packushb   $f8, $f8, $f10                            \n\t"
581     "gssdxc1    $f8, 0x0(%[pPred], $0)                    \n\t"
582     "paddh      $f0, $f0, $f16                            \n\t"
583     "paddh      $f2, $f2, $f18                            \n\t"
584     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
585     PTR_ADDIU  "$8, $8, 0x1                               \n\t"
586     PTR_ADDIU  "$11, $8, -0x8                             \n\t"
587     "bnez       $11, 1b                                   \n\t"
588     "nop                                                  \n\t"
589     : [pPred]"+&r"((unsigned char *)pPred)
590     : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
591       [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
592     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
593       "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
594   );
595   RECOVER_REG;
596 }
597 
WelsDecoderIChromaPredDc_mmi(uint8_t * pPred,const int32_t kiStride)598 void WelsDecoderIChromaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
599   __asm__ volatile(
600     ".set       arch=loongson3a                           \n\t"
601     "move       $10, %[pPred]                             \n\t"
602 
603     PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
604     "gsldxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
605 
606     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
607     "lbu        $8, -0x1(%[pPred])                        \n\t"
608     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
609     "lbu        $9, -0x1(%[pPred])                        \n\t"
610     PTR_ADDU   "$8, $8, $9                                \n\t"
611     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
612     "lbu        $9, -0x1(%[pPred])                        \n\t"
613     PTR_ADDU   "$8, $8, $9                                \n\t"
614     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
615     "lbu        $9, -0x1(%[pPred])                        \n\t"
616     PTR_ADDU   "$8, $8, $9                                \n\t"
617     "dmtc1      $8, $f2                                   \n\t"
618 
619     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
620     "lbu        $8, -0x1(%[pPred])                        \n\t"
621     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
622     "lbu        $9, -0x1(%[pPred])                        \n\t"
623     PTR_ADDU   "$8, $8, $9                                \n\t"
624     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
625     "lbu        $9, -0x1(%[pPred])                        \n\t"
626     PTR_ADDU   "$8, $8, $9                                \n\t"
627     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
628     "lbu        $9, -0x1(%[pPred])                        \n\t"
629     PTR_ADDU   "$8, $8, $9                                \n\t"
630     "dmtc1      $8, $f4                                   \n\t"
631 
632     "xor        $f8, $f8, $f8                             \n\t"
633     "punpcklwd  $f6, $f0, $f8                             \n\t"
634     "punpckhwd  $f0, $f0, $f8                             \n\t"
635     "pasubub    $f0, $f0, $f8                             \n\t"
636     "pasubub    $f6, $f6, $f8                             \n\t"
637     "biadd      $f0, $f0                                  \n\t"
638     "biadd      $f6, $f6                                  \n\t"
639 
640     "paddd      $f6, $f6, $f2                             \n\t"
641     "paddd      $f2, $f4, $f0                             \n\t"
642 
643     "dli        $8, 0x2                                   \n\t"
644     "dmtc1      $8, $f8                                   \n\t"
645     "gsldxc1    $f12, 0x0(%[mmi_01bytes], $0)             \n\t"
646     "dli        $8, 0x3                                   \n\t"
647     "dmtc1      $8, $f10                                  \n\t"
648 
649     "paddd      $f0, $f0, $f8                             \n\t"
650     "dsrl       $f0, $f0, $f8                             \n\t"
651 
652     "paddd      $f4, $f4, $f8                             \n\t"
653     "dsrl       $f4, $f4, $f8                             \n\t"
654 
655     "paddd      $f6, $f6, $f8                             \n\t"
656     "paddd      $f6, $f6, $f8                             \n\t"
657     "dsrl       $f6, $f6, $f10                            \n\t"
658 
659     "paddd      $f2, $f2, $f8                             \n\t"
660     "paddd      $f2, $f2, $f8                             \n\t"
661     "dsrl       $f2, $f2, $f10                            \n\t"
662 
663     "dli        $8, 0x20                                  \n\t"
664     "dmtc1      $8, $f8                                   \n\t"
665     "pmuluw     $f0, $f0, $f12                            \n\t"
666     "pmuluw     $f6, $f6, $f12                            \n\t"
667     "dsll       $f0, $f0, $f8                             \n\t"
668     "xor        $f0, $f0, $f6                             \n\t"
669 
670     "pmuluw     $f4, $f4, $f12                            \n\t"
671     "pmuluw     $f2, $f2, $f12                            \n\t"
672     "dsll       $f2, $f2, $f8                             \n\t"
673     "xor        $f2, $f2, $f4                             \n\t"
674 
675     "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
676     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
677     "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
678     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
679     "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
680     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
681     "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
682 
683     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
684     "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
685     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
686     "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
687     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
688     "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
689     PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
690     "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
691     : [pPred] "+&r"((unsigned char *)pPred)
692     : [kiStride] "r"((int)kiStride),
693       [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
694     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
695       "$f12"
696   );
697 }
698 
WelsDecoderIChromaPredDcTop_mmi(uint8_t * pPred,const int32_t kiStride)699 void WelsDecoderIChromaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
700   BACKUP_REG;
701   __asm__ volatile (
702     ".set       arch=loongson3a                           \n\t"
703     "dli        $8, 0x4e                                  \n\t"
704     "dmtc1      $8, $f16                                  \n\t"
705     "dli        $8, 0xb1                                  \n\t"
706     "dmtc1      $8, $f18                                  \n\t"
707     "dli        $8, 0x2                                   \n\t"
708     "dmtc1      $8, $f20                                  \n\t"
709     PTR_SUBU   "$8, %[pPred], %[kiStride]                 \n\t"
710     "gsldxc1    $f0, 0x0($8, $0)                          \n\t"
711     "xor        $f28, $f28, $f28                          \n\t"
712     "punpckhbh  $f2, $f0, $f28                            \n\t"
713     "punpcklbh  $f0, $f0, $f28                            \n\t"
714     "pshufh     $f4, $f0, $f16                            \n\t"
715     "pshufh     $f6, $f2, $f16                            \n\t"
716     "paddh      $f0, $f0, $f4                             \n\t"
717     "paddh      $f2, $f2, $f6                             \n\t"
718 
719     "pshufh     $f8, $f0, $f18                            \n\t"
720     "pshufh     $f14, $f2, $f18                           \n\t"
721     "paddh      $f2, $f2, $f14                            \n\t"
722     "paddh      $f0, $f0, $f8                             \n\t"
723 
724     "gslqc1     $f26, $f24, 0x0(%[mmi_wd_0x02])           \n\t"
725     "paddh      $f0, $f0, $f24                            \n\t"
726     "paddh      $f2, $f2, $f26                            \n\t"
727     "psrah      $f0, $f0, $f20                            \n\t"
728     "psrah      $f2, $f2, $f20                            \n\t"
729     "packushb   $f0, $f0, $f2                             \n\t"
730 
731     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
732     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
733     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
734     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
735     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
736     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
737     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
738 
739     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
740     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
741     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
742     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
743     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
744     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
745     PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
746     "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
747     : [pPred] "+&r"((unsigned char *)pPred)
748     : [kiStride] "r"((int)kiStride), [mmi_wd_0x02] "r"((short *)mmi_wd_0x02)
749     : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
750   );
751   RECOVER_REG;
752 }
753 
WelsDecoderI4x4LumaPredH_mmi(uint8_t * pPred,const int32_t kiStride)754 void WelsDecoderI4x4LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
755   __asm__ volatile(
756     ".set       arch=loongson3a                           \n\t"
757     "gsldxc1    $f8, 0x0(%[mmi_01bytes], $0)              \n\t"
758     "lbu        $8, -0x1(%[pPred])                        \n\t"
759     "dmtc1      $8, $f0                                   \n\t"
760     "pmuluw     $f0, $f0, $f8                             \n\t"
761 
762     PTR_ADDU   "$9, %[pPred], %[kiStride]                 \n\t"
763     "lbu        $8, -0x1($9)                              \n\t"
764     "dmtc1      $8, $f2                                   \n\t"
765     "pmuluw     $f2, $f2, $f8                             \n\t"
766 
767     PTR_ADDU   "$10, $9, %[kiStride]                      \n\t"
768     "lbu        $8, -0x1($10)                             \n\t"
769     "dmtc1      $8, $f4                                   \n\t"
770     "pmuluw     $f4, $f4, $f8                             \n\t"
771 
772     PTR_ADDU   "$11, $10, %[kiStride]                     \n\t"
773     "lbu        $8, -0x1($11)                             \n\t"
774     "dmtc1      $8, $f6                                   \n\t"
775     "pmuluw     $f6, $f6, $f8                             \n\t"
776 
777     "gsswxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
778     "gsswxc1    $f2, 0x0($9, $0)                          \n\t"
779     "gsswxc1    $f4, 0x0($10, $0)                         \n\t"
780     "gsswxc1    $f6, 0x0($11, $0)                         \n\t"
781     : [pPred] "+&r"((unsigned char *)pPred)
782     : [kiStride] "r"((int)kiStride),
783       [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
784     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8"
785   );
786 }
787