• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    vaa_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    23/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
/*
 * WELS_MAX_REG_MMI(f0, f2, fShuf, fShift)
 *
 * Expands to an asm string that folds the eight unsigned bytes of f0 (and,
 * independently, of f2) down to their maximum. The maximum ends up in the
 * lowest byte of the same register; the other bytes hold partial maxima.
 *
 *   f0, f2   in/out  registers reduced in place
 *   fShuf    in      pshufh selector register; the caller must preload 0x1
 *   fShift   in      dsrl shift-count register; the caller must preload 0x8
 *
 * $f4 and $f6 are hard-coded scratch registers and are overwritten. They are
 * spelled literally inside the strings and are NOT macro parameters, so none
 * of the four arguments may be $f4 or $f6.
 */
#define WELS_MAX_REG_MMI(f0, f2, fShuf, fShift) \
  /* fold the upper four bytes onto the lower four */ \
  "punpckhwd  $f4, "#f0", "#f0"    \n\t" \
  "punpckhwd  $f6, "#f2", "#f2"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t" \
  /* selector 0x1 brings halfword 1 down to halfword 0: four bytes -> two */ \
  "pshufh     $f4, "#f0", "#fShuf"    \n\t" \
  "pshufh     $f6, "#f2", "#fShuf"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t" \
  /* shift right by 8 bits so byte 1 meets byte 0: two bytes -> one */ \
  "dsrl       $f4, "#f0", "#fShift"    \n\t" \
  "dsrl       $f6, "#f2", "#fShift"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t"
57 
/*
 * WELS_SAD_SD_MAD_16x1_MMI(sad_lo, sad_hi, sumcur_lo, sumcur_hi,
 *                          sumref_lo, sumref_hi, mad_lo, mad_hi,
 *                          pCur, pRef, stride)
 *
 * Expands to an asm string that processes one 16-pixel row of the current
 * picture (at pCur) against the reference picture (at pRef) and then advances
 * both pointers by stride.
 *
 * Every accumulator comes as a pair: "_lo" pairs with the second gslqc1
 * register operand and "_hi" with the first. (Presumably these are the
 * lower-/upper-addressed 8 bytes of the row -- confirm against the Loongson
 * gslqc1 definition.)
 *
 *   sad_lo/hi      in/out  += sum of |cur - ref| over the 8 bytes
 *   sumcur_lo/hi   in/out  += sum of the 8 current bytes
 *   sumref_lo/hi   in/out  += sum of the 8 reference bytes
 *   mad_lo/hi      in/out  per-byte running maximum of |cur - ref|
 *   pCur, pRef     in/out  row pointers, advanced by stride
 *   stride         in      byte distance between rows
 *
 * Preconditions and clobbers (literal registers inside the strings, NOT the
 * macro parameters):
 *   - $f0 and $f2 must be zero; "pasubub x, x, $f0" is used as |x - 0|.
 *   - $f4, $f6, $f8, $f10, $f12, $f14 are scratch and are overwritten, so
 *     none of the accumulator arguments may be $f0..$f14.
 */
#define WELS_SAD_SD_MAD_16x1_MMI(sad_lo, sad_hi, sumcur_lo, sumcur_hi, \
                                 sumref_lo, sumref_hi, mad_lo, mad_hi, \
                                 pCur, pRef, stride) \
  /* load 16 current bytes into $f4/$f6 and 16 reference bytes into $f8/$f10 */ \
  "gslqc1     $f6, $f4, 0x0("#pCur")              \n\t" \
  "gslqc1     $f10, $f8, 0x0("#pRef")             \n\t" \
  /* byte sum of the current row */ \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#sumcur_lo", "#sumcur_lo", $f12    \n\t" \
  "paddw      "#sumcur_hi", "#sumcur_hi", $f14    \n\t" \
  /* byte sum of the reference row */ \
  "pasubub    $f12, $f8, $f0                      \n\t" \
  "pasubub    $f14, $f10, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#sumref_lo", "#sumref_lo", $f12    \n\t" \
  "paddw      "#sumref_hi", "#sumref_hi", $f14    \n\t" \
  /* per-byte |cur - ref|: feed the running maximum, then the SAD */ \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "pmaxub     "#mad_lo", "#mad_lo", $f12          \n\t" \
  "pmaxub     "#mad_hi", "#mad_hi", $f14          \n\t" \
  "pasubub    $f12, $f12, $f0                     \n\t" \
  "pasubub    $f14, $f14, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#sad_lo", "#sad_lo", $f12          \n\t" \
  "paddw      "#sad_hi", "#sad_hi", $f14          \n\t" \
  /* step both pointers to the next row */ \
  PTR_ADDU   ""#pCur", "#pCur", "#stride"         \n\t" \
  PTR_ADDU   ""#pRef", "#pRef", "#stride"         \n\t"
85 
/*
 * WELS_SAD_16x2_MMI(cur0_hi, cur0_lo, ref0_hi, ref0_lo,
 *                   cur1_hi, cur1_lo, ref1_hi, ref1_lo,
 *                   acc_hi, acc_lo, pCur, pRef, stride)
 *
 * Expands to an asm string that adds the sum of absolute differences of two
 * consecutive 16-pixel rows to a pair of accumulators and advances both row
 * pointers by two rows.
 *
 * Unlike the other macros in this file, every register is a parameter; no
 * register is hard-coded.
 *
 *   cur0_*, ref0_*  scratch  first row of current / reference pixels
 *   cur1_*, ref1_*  scratch  second row of current / reference pixels
 *   acc_hi          in/out   += SAD of the half loaded into the "_hi" registers
 *   acc_lo          in/out   += SAD of the half loaded into the "_lo" registers
 *   pCur, pRef      in/out   row pointers, advanced by 2 * stride
 *   stride          in       byte distance between rows
 *
 * "_hi" is the first gslqc1 register operand and "_lo" the second.
 * (Presumably the upper-/lower-addressed 8 bytes of the row -- confirm against
 * the Loongson gslqc1 definition.)
 */
#define WELS_SAD_16x2_MMI(cur0_hi, cur0_lo, ref0_hi, ref0_lo, \
                          cur1_hi, cur1_lo, ref1_hi, ref1_lo, \
                          acc_hi, acc_lo, pCur, pRef, stride) \
  /* load row 0 and row 1 of both pictures, stepping the pointers once */ \
  "gslqc1     "#cur0_hi", "#cur0_lo", 0x00("#pCur")       \n\t" \
  "gslqc1     "#ref0_hi", "#ref0_lo", 0x00("#pRef")       \n\t" \
  PTR_ADDU    ""#pCur", "#pCur", "#stride"                \n\t" \
  "gslqc1     "#cur1_hi", "#cur1_lo", 0x00("#pCur")       \n\t" \
  PTR_ADDU    ""#pRef", "#pRef", "#stride"                \n\t" \
  "gslqc1     "#ref1_hi", "#ref1_lo", 0x00("#pRef")       \n\t" \
  /* row 0: per-byte |cur - ref|, then sum across the 8 bytes */ \
  "pasubub    "#cur0_hi", "#cur0_hi", "#ref0_hi"          \n\t" \
  "pasubub    "#cur0_lo", "#cur0_lo", "#ref0_lo"          \n\t" \
  "biadd      "#cur0_hi", "#cur0_hi"                      \n\t" \
  "biadd      "#cur0_lo", "#cur0_lo"                      \n\t" \
  /* row 1: same */ \
  "pasubub    "#cur1_hi", "#cur1_hi", "#ref1_hi"          \n\t" \
  "pasubub    "#cur1_lo", "#cur1_lo", "#ref1_lo"          \n\t" \
  "biadd      "#cur1_hi", "#cur1_hi"                      \n\t" \
  "biadd      "#cur1_lo", "#cur1_lo"                      \n\t" \
  /* accumulate both rows */ \
  "paddw      "#acc_hi", "#acc_hi", "#cur0_hi"            \n\t" \
  "paddw      "#acc_hi", "#acc_hi", "#cur1_hi"            \n\t" \
  "paddw      "#acc_lo", "#acc_lo", "#cur0_lo"            \n\t" \
  "paddw      "#acc_lo", "#acc_lo", "#cur1_lo"            \n\t" \
  /* step past row 1 */ \
  PTR_ADDU    ""#pCur", "#pCur", "#stride"                \n\t" \
  PTR_ADDU    ""#pRef", "#pRef", "#stride"                \n\t"
107 
/*
 * WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(pCur, pRef, stride)
 *
 * Expands to an asm string that processes one 16-pixel row of the current
 * picture (at pCur) against the reference picture (at pRef), updates four
 * fixed accumulator pairs and advances both pointers by stride.
 *
 * All floating-point registers are hard-coded:
 *   $f0,  $f2    in      must be zero (used as the zero operand throughout)
 *   $f28, $f30   in/out  += sum of |cur - ref|, one register per 8-byte half
 *   $f24, $f26   in/out  += sum of the current bytes, one register per half
 *   $f20, $f22   in/out  += sum of squared current bytes; the total over the
 *                          16 pixels is spread across the 32-bit lanes of
 *                          both registers, not split by half
 *   $f16, $f18   in/out  += sum of squared |cur - ref|, spread the same way
 *   $f4 .. $f14  scratch overwritten
 *
 *   pCur, pRef   in/out  row pointers, advanced by stride
 *   stride       in      byte distance between rows
 */
#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(pCur, pRef, stride) \
  /* load current row into $f4/$f6, reference row into $f8/$f10 */ \
  "gslqc1     $f6, $f4, 0x0("#pCur")              \n\t" \
  "gslqc1     $f10, $f8, 0x0("#pRef")             \n\t" \
  /* SAD */ \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f28, $f28, $f12                    \n\t" \
  "paddw      $f30, $f30, $f14                    \n\t" \
  /* biadd destroyed the per-byte differences; recompute them for later */ \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  /* byte sum of the current row (reference pixels are no longer needed) */ \
  "pasubub    $f8, $f4, $f0                       \n\t" \
  "pasubub    $f10, $f6, $f2                      \n\t" \
  "biadd      $f8, $f8                            \n\t" \
  "biadd      $f10, $f10                          \n\t" \
  "paddw      $f24, $f24, $f8                     \n\t" \
  "paddw      $f26, $f26, $f10                    \n\t" \
  /* widen current bytes to halfwords, square and pair-add */ \
  "punpcklbh  $f8, $f6, $f2                       \n\t" \
  "punpckhbh  $f10, $f6, $f2                      \n\t" \
  "punpckhbh  $f6, $f4, $f0                       \n\t" \
  "punpcklbh  $f4, $f4, $f0                       \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "paddw      $f20, $f20, $f4                     \n\t" \
  "paddw      $f22, $f22, $f6                     \n\t" \
  "paddw      $f20, $f20, $f8                     \n\t" \
  "paddw      $f22, $f22, $f10                    \n\t" \
  /* widen the per-byte differences, square and pair-add */ \
  "punpcklbh  $f4, $f12, $f0                      \n\t" \
  "punpckhbh  $f6, $f12, $f0                      \n\t" \
  "punpcklbh  $f12, $f14, $f2                     \n\t" \
  "punpckhbh  $f14, $f14, $f2                     \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      $f16, $f16, $f4                     \n\t" \
  "paddw      $f18, $f18, $f6                     \n\t" \
  "paddw      $f16, $f16, $f12                    \n\t" \
  "paddw      $f18, $f18, $f14                    \n\t" \
  /* step both pointers to the next row */ \
  PTR_ADDU   ""#pCur", "#pCur", "#stride"         \n\t" \
  PTR_ADDU   ""#pRef", "#pRef", "#stride"         \n\t"
151 
/*
 * WELS_SAD_BGD_SQDIFF_16x1_MMI(sadsq0, sadsq1, sums_lo, sums_hi,
 *                              mad_lo, mad_hi, sqdiff0, sqdiff1,
 *                              pCur, pRef, stride)
 *
 * Expands to an asm string that processes one 16-pixel row of the current
 * picture (at pCur) against the reference picture (at pRef) and advances both
 * pointers by stride. Two statistics are packed into each of the first four
 * accumulators, one per 32-bit word (as read from the instruction sequence):
 *
 *   sadsq0/1    in/out  low word  += sum of |cur - ref| of one 8-byte half
 *                       high word += sum of squared current bytes; the two
 *                                    registers together cover all 16 pixels,
 *                                    but the split is not by half
 *   sums_lo/hi  in/out  low word  += sum of current bytes of one half
 *                       high word += sum of reference bytes of the same half
 *   mad_lo/hi   in/out  per-byte running maximum of |cur - ref|
 *   sqdiff0/1   in/out  += squared |cur - ref|, spread over the 32-bit lanes
 *                          of both registers
 *   pCur, pRef  in/out  row pointers, advanced by stride
 *   stride      in      byte distance between rows
 *
 * Preconditions and clobbers (literal registers inside the strings, NOT the
 * macro parameters):
 *   - $f0 and $f2 must be zero.
 *   - $f4, $f6, $f8, $f10, $f12, $f14 are scratch and are overwritten, so
 *     none of the accumulator arguments may be $f0..$f14.
 */
#define WELS_SAD_BGD_SQDIFF_16x1_MMI(sadsq0, sadsq1, sums_lo, sums_hi, \
                                     mad_lo, mad_hi, sqdiff0, sqdiff1, \
                                     pCur, pRef, stride) \
  /* current row: widen to halfwords, square and pair-add */ \
  "gslqc1     $f6, $f4, 0x0("#pCur")              \n\t" \
  "punpcklbh  $f8, $f4, $f0                       \n\t" \
  "punpckhbh  $f10, $f4, $f0                      \n\t" \
  "punpcklbh  $f12, $f6, $f2                      \n\t" \
  "punpckhbh  $f14, $f6, $f2                      \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      $f8, $f8, $f12                      \n\t" \
  "paddw      $f10, $f10, $f14                    \n\t" \
  /* collapse each register to one total and park it in the HIGH word */ \
  "punpckhwd  $f12, $f0, $f8                      \n\t" \
  "punpckhwd  $f14, $f0, $f10                     \n\t" \
  "punpcklwd  $f8, $f0, $f8                       \n\t" \
  "punpcklwd  $f10, $f0, $f10                     \n\t" \
  "paddw      $f8, $f8, $f12                      \n\t" \
  "paddw      $f10, $f10, $f14                    \n\t" \
  "paddw      "#sadsq0", "#sadsq0", $f8           \n\t" \
  "paddw      "#sadsq1", "#sadsq1", $f10          \n\t" \
  /* reference row into $f8/$f10; byte sum of current row -> low word */ \
  "gslqc1     $f10, $f8, 0x0("#pRef")             \n\t" \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#sums_lo", "#sums_lo", $f12        \n\t" \
  "paddw      "#sums_hi", "#sums_hi", $f14        \n\t" \
  /* byte sum of reference row, moved to the HIGH word before accumulating */ \
  "pasubub    $f12, $f8, $f0                      \n\t" \
  "pasubub    $f14, $f10, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "punpcklwd  $f14, $f14, $f14                    \n\t" \
  "punpckhwd  $f14, $f12, $f14                    \n\t" \
  "punpcklwd  $f12, $f0, $f12                     \n\t" \
  "paddw      "#sums_lo", "#sums_lo", $f12        \n\t" \
  "paddw      "#sums_hi", "#sums_hi", $f14        \n\t" \
  /* per-byte |cur - ref|: running maximum */ \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "pmaxub     "#mad_lo", "#mad_lo", $f12          \n\t" \
  "pmaxub     "#mad_hi", "#mad_hi", $f14          \n\t" \
  /* keep a copy of the differences (add with zero), biadd destroys them */ \
  "paddw      $f4, $f0, $f12                      \n\t" \
  "paddw      $f6, $f0, $f14                      \n\t" \
  /* SAD -> low word */ \
  "pasubub    $f12, $f12, $f0                     \n\t" \
  "pasubub    $f14, $f14, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#sadsq0", "#sadsq0", $f12          \n\t" \
  "paddw      "#sadsq1", "#sadsq1", $f14          \n\t" \
  /* restore the differences, widen, square and pair-add */ \
  "paddw      $f12, $f0, $f4                      \n\t" \
  "paddw      $f14, $f0, $f6                      \n\t" \
  "punpcklbh  $f4, $f12, $f0                      \n\t" \
  "punpckhbh  $f6, $f12, $f0                      \n\t" \
  "punpcklbh  $f12, $f14, $f2                     \n\t" \
  "punpckhbh  $f14, $f14, $f2                     \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      "#sqdiff0", "#sqdiff0", $f4         \n\t" \
  "paddw      "#sqdiff1", "#sqdiff1", $f6         \n\t" \
  "paddw      "#sqdiff0", "#sqdiff0", $f12        \n\t" \
  "paddw      "#sqdiff1", "#sqdiff1", $f14        \n\t" \
  /* step both pointers to the next row */ \
  PTR_ADDU   ""#pCur", "#pCur", "#stride"         \n\t" \
  PTR_ADDU   ""#pRef", "#pRef", "#stride"         \n\t"
216 
/*
 * WELS_SAD_SUM_SQSUM_16x1_MMI(pCur, pRef, stride)
 *
 * Expands to an asm string that processes one 16-pixel row of the current
 * picture (at pCur) against the reference picture (at pRef), updates three
 * fixed accumulator pairs and advances both pointers by stride.
 *
 * All floating-point registers are hard-coded:
 *   $f0,  $f2    in      must be zero (used as the zero operand throughout)
 *   $f24, $f26   in/out  += sum of |cur - ref|, one register per 8-byte half
 *   $f20, $f22   in/out  += sum of the current bytes, one register per half
 *   $f16, $f18   in/out  += sum of squared current bytes; the total over the
 *                          16 pixels is spread across the 32-bit lanes of
 *                          both registers, not split by half
 *   $f4 .. $f14  scratch overwritten
 *
 *   pCur, pRef   in/out  row pointers, advanced by stride
 *   stride       in      byte distance between rows
 */
#define WELS_SAD_SUM_SQSUM_16x1_MMI(pCur, pRef, stride) \
  /* load current row into $f4/$f6, reference row into $f8/$f10 */ \
  "gslqc1     $f6, $f4, 0x0("#pCur")              \n\t" \
  "gslqc1     $f10, $f8, 0x0("#pRef")             \n\t" \
  /* SAD */ \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f24, $f24, $f12                    \n\t" \
  "paddw      $f26, $f26, $f14                    \n\t" \
  /* byte sum of the current row */ \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f20, $f20, $f12                    \n\t" \
  "paddw      $f22, $f22, $f14                    \n\t" \
  /* widen current bytes to halfwords, square and pair-add */ \
  "punpcklbh  $f8, $f6, $f2                       \n\t" \
  "punpckhbh  $f10, $f6, $f2                      \n\t" \
  "punpckhbh  $f6, $f4, $f0                       \n\t" \
  "punpcklbh  $f4, $f4, $f0                       \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "paddw      $f16, $f16, $f4                     \n\t" \
  "paddw      $f18, $f18, $f6                     \n\t" \
  "paddw      $f16, $f16, $f8                     \n\t" \
  "paddw      $f18, $f18, $f10                    \n\t" \
  /* step both pointers to the next row */ \
  PTR_ADDU   ""#pCur", "#pCur", "#stride"         \n\t" \
  PTR_ADDU   ""#pRef", "#pRef", "#stride"         \n\t"
246 
VAACalcSad_mmi(const uint8_t * pCurData,const uint8_t * pRefData,int32_t iPicWidth,int32_t iPicHeight,int32_t iPicStride,int32_t * pFrameSad,int32_t * pSad8x8)247 void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
248                     int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
249                     int32_t* pFrameSad, int32_t* pSad8x8) {
250   double ftmp[13];
251   uint64_t tmp[2];
252   mips_reg addr[3];
253 
254   __asm__ volatile (
255     ".set       arch=loongson3a                                     \n\t"
256     PTR_SRL    "%[iPicWidth],   %[iPicWidth],   0x04                \n\t"
257     PTR_SRL    "%[iPicHeight],  %[iPicHeight],  0x04                \n\t"
258     "move       %[addr2],       %[iPicStride]                       \n\t"
259     PTR_SLL    "%[iPicStride],  %[iPicStride],  0x04                \n\t"
260     "xor        %[ftmp0],       %[ftmp0],       %[ftmp0]            \n\t"
261     "xor        %[ftmp11],      %[ftmp11],      %[ftmp11]           \n\t"
262     "xor        %[ftmp12],      %[ftmp12],      %[ftmp12]           \n\t"
263     "1:                                                             \n\t"
264     "move       %[addr0],       %[pCurData]                         \n\t"
265     "move       %[addr1],       %[pRefData]                         \n\t"
266     "move       %[tmp0],        %[iPicWidth]                        \n\t"
267     "2:                                                             \n\t"
268     "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
269     "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
270     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
271                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
272                       %[addr0], %[addr1], %[addr2])
273     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
274                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
275                       %[addr0], %[addr1], %[addr2])
276     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
277                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
278                       %[addr0], %[addr1], %[addr2])
279     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
280                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
281                       %[addr0], %[addr1], %[addr2])
282     "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
283     "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
284     "swc1       %[ftmp10],      0x00(%[pSad8x8])                    \n\t"
285     "swc1       %[ftmp9],       0x04(%[pSad8x8])                    \n\t"
286 
287     "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
288     "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
289     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
290                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
291                       %[addr0], %[addr1], %[addr2])
292     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
293                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
294                       %[addr0], %[addr1], %[addr2])
295     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
296                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
297                       %[addr0], %[addr1], %[addr2])
298     WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
299                       %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
300                       %[addr0], %[addr1], %[addr2])
301     "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
302     "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
303     "swc1       %[ftmp10],      0x08(%[pSad8x8])                    \n\t"
304     "swc1       %[ftmp9],       0x0c(%[pSad8x8])                    \n\t"
305 
306     PTR_ADDU   "%[pSad8x8],     %[pSad8x8],     0x10                \n\t"
307     PTR_SUBU   "%[addr0],       %[addr0],       %[iPicStride]       \n\t"
308     PTR_SUBU   "%[addr1],       %[addr1],       %[iPicStride]       \n\t"
309     PTR_ADDI   "%[tmp0],        %[tmp0],        -0x01               \n\t"
310     PTR_ADDU   "%[addr0],       %[addr0],       0x10                \n\t"
311     PTR_ADDU   "%[addr1],       %[addr1],       0x10                \n\t"
312     "bnez       %[tmp0],        2b                                  \n\t"
313 
314     PTR_ADDI   "%[iPicHeight],  %[iPicHeight],  -0x01               \n\t"
315     PTR_ADDU   "%[pCurData],    %[pCurData],    %[iPicStride]       \n\t"
316     PTR_ADDU   "%[pRefData],    %[pRefData],    %[iPicStride]       \n\t"
317     "bnez       %[iPicHeight],  1b                                  \n\t"
318 
319     "paddw      %[ftmp11],      %[ftmp11],      %[ftmp12]           \n\t"
320     "swc1       %[ftmp11],      0x00(%[pFrameSad])                  \n\t"
321     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
322       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
323       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
324       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
325       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
326       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
327       [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
328       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
329       [pCurData]"+&r"(pCurData),        [pRefData]"+&r"(pRefData),
330       [iPicHeight]"+&r"(iPicHeight),    [iPicWidth]"+&r"(iPicWidth),
331       [pSad8x8]"+&r"(pSad8x8),          [iPicStride]"+&r"(iPicStride),
332       [addr2]"=&r"(addr[2])
333     : [pFrameSad]"r"(pFrameSad)
334     : "memory"
335   );
336 }
337 
VAACalcSadBgd_mmi(const uint8_t * cur_data,const uint8_t * ref_data,int32_t iPicWidth,int32_t iPicHeight,int32_t iPicStride,int32_t * psadframe,int32_t * psad8x8,int32_t * p_sd8x8,uint8_t * p_mad8x8)338 void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
339                        int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
340                        int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
341                        uint8_t *p_mad8x8) {
342   BACKUP_REG;
343   __asm__ volatile (
344     ".set       arch=loongson3a                           \n\t"
345     "move       $15, %[cur_data]                          \n\t"
346     "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
347     "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
348     "dsll       $13, %[iPicStride], 0x4                   \n\t"
349     "xor        $f0, $f0, $f0                             \n\t"
350     "xor        $f2, $f2, $f2                             \n\t"
351     "xor        $14, $14, $14                             \n\t"
352     "1:                                                   \n\t"
353     "move       $9, %[iPicWidth]                          \n\t"
354     "move       $10, $15                                  \n\t"
355     "move       $11, %[ref_data]                          \n\t"
356     "2:                                                   \n\t"
357     "xor        $f28, $f28, $f28                          \n\t"
358     "xor        $f30, $f30, $f30                          \n\t"
359     "xor        $f24, $f24, $f24                          \n\t"
360     "xor        $f26, $f26, $f26                          \n\t"
361     "xor        $f20, $f20, $f20                          \n\t"
362     "xor        $f22, $f22, $f22                          \n\t"
363     "xor        $f16, $f16, $f16                          \n\t"
364     "xor        $f18, $f18, $f18                          \n\t"
365     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
366                              $15, %[ref_data], %[iPicStride])
367     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
368                              $15, %[ref_data], %[iPicStride])
369     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
370                              $15, %[ref_data], %[iPicStride])
371     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
372                              $15, %[ref_data], %[iPicStride])
373     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
374                              $15, %[ref_data], %[iPicStride])
375     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
376                              $15, %[ref_data], %[iPicStride])
377     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
378                              $15, %[ref_data], %[iPicStride])
379     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
380                              $15, %[ref_data], %[iPicStride])
381 
382     "dli        $8, 0x1                                   \n\t"
383     "dmtc1      $8, $f8                                   \n\t"
384     "dli        $8, 0x8                                   \n\t"
385     "dmtc1      $8, $f10                                  \n\t"
386     WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
387 
388     "dmfc1      $8, $f16                                  \n\t"
389     "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
390     "dmfc1      $8, $f18                                  \n\t"
391     "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
392     PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
393 
394     "xor        $f16, $f16, $f16                          \n\t"
395     "xor        $f18, $f18, $f18                          \n\t"
396     "punpcklwd  $f30, $f30, $f30                          \n\t"
397     "punpcklwd  $f26, $f26, $f26                          \n\t"
398     "punpcklwd  $f22, $f22, $f22                          \n\t"
399 
400     "punpckhwd  $f30, $f28, $f30                          \n\t"
401     "punpckhwd  $f26, $f24, $f26                          \n\t"
402     "punpckhwd  $f22, $f20, $f22                          \n\t"
403 
404     "punpcklwd  $f28, $f16, $f28                          \n\t"
405     "punpcklwd  $f24, $f16, $f24                          \n\t"
406     "punpcklwd  $f20, $f16, $f20                          \n\t"
407 
408     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
409                              $15, %[ref_data], %[iPicStride])
410     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
411                              $15, %[ref_data], %[iPicStride])
412     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
413                              $15, %[ref_data], %[iPicStride])
414     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
415                              $15, %[ref_data], %[iPicStride])
416     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
417                              $15, %[ref_data], %[iPicStride])
418     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
419                              $15, %[ref_data], %[iPicStride])
420     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
421                              $15, %[ref_data], %[iPicStride])
422     WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
423                              $15, %[ref_data], %[iPicStride])
424 
425     "dli        $8, 0x1                                   \n\t"
426     "dmtc1      $8, $f8                                   \n\t"
427     "dli        $8, 0x8                                   \n\t"
428     "dmtc1      $8, $f10                                  \n\t"
429     WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
430 
431     "dmfc1      $8, $f16                                  \n\t"
432     "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
433     "dmfc1      $8, $f18                                  \n\t"
434     "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
435     "punpckhwd  $f4, $f28, $f30                           \n\t"
436     PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
437 
438     "punpcklwd  $f6, $f28, $f30                           \n\t"
439     "gssqc1     $f6, $f4, 0x0(%[psad8x8])                 \n\t"
440     PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10              \n\t"
441 
442     "paddw      $f6, $f6, $f30                            \n\t"
443     "paddw      $f4, $f4, $f28                            \n\t"
444     "punpckhwd  $f8, $f6, $f6                             \n\t"
445     "paddw      $f4, $f4, $f8                             \n\t"
446     "dmtc1      $14, $f6                                  \n\t"
447     "paddw      $f6, $f6, $f4                             \n\t"
448     "dmfc1      $14, $f6                                  \n\t"
449 
450     "psubw      $f24, $f24, $f20                          \n\t"
451     "psubw      $f26, $f26, $f22                          \n\t"
452     "punpckhwd  $f4, $f24, $f26                           \n\t"
453     "punpcklwd  $f6, $f24, $f26                           \n\t"
454     "gssqc1     $f6, $f4, 0x0(%[p_sd8x8])                 \n\t"
455     PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x10              \n\t"
456 
457     PTR_SUBU   "$15, $15, $13                             \n\t"
458     PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
459     PTR_ADDIU  "$15, $15, 0x10                            \n\t"
460     PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
461 
462     PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
463     "bnez       %[iPicWidth], 2b                          \n\t"
464     "move       %[iPicWidth], $9                          \n\t"
465     "move       $15, $10                                  \n\t"
466     "move       %[ref_data], $11                          \n\t"
467     PTR_ADDU   "$15, $15, $13                             \n\t"
468     PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
469 
470     PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
471     "bnez       %[iPicHeight], 1b                         \n\t"
472 
473     "swl        $14, 0x3(%[psadframe])                    \n\t"
474     "swr        $14, 0x0(%[psadframe])                    \n\t"
475     : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
476       [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
477       [p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
478     : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
479       [psadframe]"r"((int *)psadframe)
480     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
481       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
482       "$f24", "$f26", "$f28", "$f30"
483   );
484   RECOVER_REG;
485 }
486 
VAACalcSadSsd_mmi(const uint8_t * cur_data,const uint8_t * ref_data,int32_t iPicWidth,int32_t iPicHeight,int32_t iPicStride,int32_t * psadframe,int32_t * psad8x8,int32_t * psum16x16,int32_t * psqsum16x16,int32_t * psqdiff16x16)487 void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
488                        int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
489                        int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
490                        int32_t *psqsum16x16, int32_t *psqdiff16x16) {
491   BACKUP_REG;
492   __asm__ volatile (
493     ".set       arch=loongson3a                           \n\t"
494     "move       $15, %[cur_data]                          \n\t"
495     "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
496     "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
497     "dsll       $13, %[iPicStride], 0x4                   \n\t"
498     "xor        $f0, $f0, $f0                             \n\t"
499     "xor        $f2, $f2, $f2                             \n\t"
500     "xor        $12, $12, $12                             \n\t"
501     "xor        $14, $14, $14                             \n\t"
502     "1:                                                   \n\t"
503     "move       $9, %[iPicWidth]                          \n\t"
504     "move       $10, $15                                  \n\t"
505     "move       $11, %[ref_data]                          \n\t"
506     "2:                                                   \n\t"
507     "xor        $f28, $f28, $f28                          \n\t"
508     "xor        $f30, $f30, $f30                          \n\t"
509     "xor        $f24, $f24, $f24                          \n\t"
510     "xor        $f26, $f26, $f26                          \n\t"
511     "xor        $f20, $f20, $f20                          \n\t"
512     "xor        $f22, $f22, $f22                          \n\t"
513     "xor        $f16, $f16, $f16                          \n\t"
514     "xor        $f18, $f18, $f18                          \n\t"
515     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
516     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
517     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
518     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
519     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
520     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
521     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
522     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
523     "dmfc1      $8, $f28                                  \n\t"
524     "sw         $8, 0x0(%[psad8x8])                       \n\t"
525     "dmfc1      $8, $f30                                  \n\t"
526     "sw         $8, 0x4(%[psad8x8])                       \n\t"
527     "paddw      $f4, $f28, $f30                           \n\t"
528     "dmfc1      $12, $f4                                  \n\t"
529 	  PTR_ADDU   "$14, $14, $12                             \n\t"
530 
531     "xor        $f28, $f28, $f28                          \n\t"
532     "xor        $f30, $f30, $f30                          \n\t"
533     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
534     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
535     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
536     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
537     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
538     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
539     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
540     WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
541     "dmfc1      $8, $f28                                  \n\t"
542     "sw         $8, 0x8(%[psad8x8])                       \n\t"
543     "dmfc1      $8, $f30                                  \n\t"
544     "paddw      $f4, $f28, $f30                           \n\t"
545     "sw         $8, 0xc(%[psad8x8])                       \n\t"
546     "dmfc1      $12, $f4                                  \n\t"
547 	  PTR_ADDU   "$14, $14, $12                             \n\t"
548     PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"
549 
550     "paddw      $f24, $f24, $f26                          \n\t"
551     "dmfc1      $8, $f24                                  \n\t"
552     "sw         $8, 0x0(%[psum16x16])                     \n\t"
553     PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
554     "paddw      $f24, $f20, $f22                          \n\t"
555 	  "punpcklwd  $f20, $f24, $f24                          \n\t"
556 	  "punpckhwd  $f22, $f24, $f24                          \n\t"
557     "paddw      $f20, $f20, $f22                          \n\t"
558     "dmfc1      $8, $f20                                  \n\t"
559     "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
560     PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
561 
562     "paddw      $f20, $f16, $f18                          \n\t"
563 	  "punpcklwd  $f16, $f20, $f20                          \n\t"
564 	  "punpckhwd  $f18, $f20, $f20                          \n\t"
565     "paddw      $f16, $f16, $f18                          \n\t"
566     "dmfc1      $8, $f16                                  \n\t"
567     "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
568     PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"
569 
570     PTR_SUBU   "$15, $15, $13                             \n\t"
571     PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
572     PTR_ADDIU  "$15, $15, 0x10                            \n\t"
573     PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
574 
575     PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
576     "bnez       %[iPicWidth], 2b                          \n\t"
577     "nop                                                  \n\t"
578     "move       %[iPicWidth], $9                          \n\t"
579     "move       $15, $10                                  \n\t"
580     "move       %[ref_data], $11                          \n\t"
581     PTR_ADDU   "$15, $15, $13                             \n\t"
582     PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
583 
584     PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
585     "bnez       %[iPicHeight], 1b                         \n\t"
586     "nop                                                  \n\t"
587 
588     "sw         $14, 0x0(%[psadframe])                    \n\t"
589     : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
590       [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
591       [psqsum16x16]"+&r"((int *)psqsum16x16), [psqdiff16x16]"+&r"((int *)psqdiff16x16)
592     : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
593       [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
594     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
595       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
596       "$f24", "$f26", "$f28", "$f30"
597   );
598   RECOVER_REG;
599 }
600 
/*!
 * \brief   Walks two pictures in 16x16 blocks with Loongson MMI inline assembly
 *          and writes several per-block statistics plus one frame total.
 *
 * The per-row arithmetic is done by WELS_SAD_BGD_SQDIFF_16x1_MMI, which is
 * defined outside this part of the file; the descriptions of what each
 * accumulator "means" below follow the output parameter names and should be
 * confirmed against that macro.
 *
 * Per 16x16 block the code below stores:
 *   - 4 x 32-bit words to psad8x8 (two after rows 0-7, two after rows 8-15),
 *   - 1 x 32-bit word to psum16x16 (first-half value, then first + second half),
 *   - 1 x 32-bit word to psqsum16x16,
 *   - 1 x 32-bit word to psqdiff16x16,
 *   - 2 x one PTR_S store to p_sd8x8, the pointer advancing 8 bytes each time,
 *   - 4 bytes to p_mad8x8 (two after each group of eight rows).
 * After the last block the running total kept in $14 is stored to *psadframe.
 *
 * \param   cur_data      first picture; copied to $15 and walked from there
 * \param   ref_data      second picture; walked in step with cur_data
 * \param   iPicWidth     shifted right by 4 to give the number of blocks per band
 * \param   iPicHeight    shifted right by 4 to give the number of 16-row bands
 * \param   iPicStride    handed to the row macro; 16 * iPicStride steps one band
 * \param   psadframe     receives one 32-bit total at the very end
 * \param   psad8x8       see list above
 * \param   psum16x16     see list above
 * \param   psqsum16x16   see list above
 * \param   psqdiff16x16  see list above
 * \param   p_sd8x8       see list above
 * \param   p_mad8x8      see list above
 *
 * NOTE(review): both loops run their body before testing the counter, so a
 * width or height below 16 (count of 0 after the shift) would not terminate
 * as intended - presumably callers guarantee dimensions >= 16; confirm.
 */
VAACalcSadSsdBgd_mmi(const uint8_t * cur_data,const uint8_t * ref_data,int32_t iPicWidth,int32_t iPicHeight,int32_t iPicStride,int32_t * psadframe,int32_t * psad8x8,int32_t * psum16x16,int32_t * psqsum16x16,int32_t * psqdiff16x16,int32_t * p_sd8x8,uint8_t * p_mad8x8)601 void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
602                           int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
603                           int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
604                           int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
605                           uint8_t *p_mad8x8) {
  /* BACKUP_REG / RECOVER_REG are defined elsewhere; presumably they save and
   * restore registers around the asm block - confirm in asmdefs_mmi.h. */
606   BACKUP_REG;
607   __asm__ volatile (
608     ".set       arch=loongson3a                           \n\t"
    /* $15 = working copy of cur_data (cur_data itself is an input-only operand). */
609     "move       $15, %[cur_data]                          \n\t"
    /* Convert pixel dimensions into counts of 16-wide / 16-high blocks. */
610     "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
611     "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
    /* $13 = 16 * stride: byte distance covered by one band of 16 rows. */
612     "dsll       $13, %[iPicStride], 0x4                   \n\t"
    /* $f0/$f2 cleared once; presumably used as zero constants by the row macro. */
613     "xor        $f0, $f0, $f0                             \n\t"
614     "xor        $f2, $f2, $f2                             \n\t"
    /* $12 = scratch, $14 = running frame total stored to *psadframe at the end. */
615     "xor        $12, $12, $12                             \n\t"
616     "xor        $14, $14, $14                             \n\t"
    /* ---- outer loop: one iteration per band of 16 rows ---- */
617     "1:                                                   \n\t"
    /* Remember the column count and both band-start pointers for the band step. */
618     "move       $9, %[iPicWidth]                          \n\t"
619     "move       $10, $15                                  \n\t"
620     "move       $11, %[ref_data]                          \n\t"
    /* ---- inner loop: one iteration per 16x16 block ---- */
621     "2:                                                   \n\t"
    /* Clear all eight accumulators handed to the row macro. */
622     "xor        $f28, $f28, $f28                          \n\t"
623     "xor        $f30, $f30, $f30                          \n\t"
624     "xor        $f24, $f24, $f24                          \n\t"
625     "xor        $f26, $f26, $f26                          \n\t"
626     "xor        $f20, $f20, $f20                          \n\t"
627     "xor        $f22, $f22, $f22                          \n\t"
628     "xor        $f16, $f16, $f16                          \n\t"
629     "xor        $f18, $f18, $f18                          \n\t"
    /* Rows 0-7: eight row-macro invocations. Each is expected to advance $15
     * and ref_data by one stride (the rewind further down subtracts 16 strides
     * after 16 invocations). */
630     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
631                                  $f18, $15, %[ref_data], %[iPicStride])
632     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
633                                  $f18, $15, %[ref_data], %[iPicStride])
634     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
635                                  $f18, $15, %[ref_data], %[iPicStride])
636     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
637                                  $f18, $15, %[ref_data], %[iPicStride])
638     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
639                                  $f18, $15, %[ref_data], %[iPicStride])
640     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
641                                  $f18, $15, %[ref_data], %[iPicStride])
642     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
643                                  $f18, $15, %[ref_data], %[iPicStride])
644     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
645                                  $f18, $15, %[ref_data], %[iPicStride])
646
    /* Store the low words of $f28 / $f30 as the two upper-half psad8x8 entries. */
647     "dmfc1      $8, $f28                                  \n\t"
648     "sw         $8, 0x0(%[psad8x8])                       \n\t"
649     "dmfc1      $8, $f30                                  \n\t"
650     "sw         $8, 0x4(%[psad8x8])                       \n\t"
651     PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"
652
    /* Add both values to the frame total in $14. */
653     "paddw      $f4, $f28, $f30                           \n\t"
654     "dmfc1      $12, $f4                                  \n\t"
655     PTR_ADDU   "$14, $14,  $12                            \n\t"
656
    /* First-half contribution to psum16x16 (low word of $f24 + $f26); the
     * pointer is not advanced yet because the second half is added later. */
657     "paddw      $f4, $f24, $f26                           \n\t"
658     "dmfc1      $8, $f4                                   \n\t"
659     "sw         $8, 0x0(%[psum16x16])                     \n\t"
660
    /* $f6 = {low($f24) - high($f24), low($f26) - high($f26)}: the two word
     * lanes of each accumulator are subtracted and the pair is written to
     * p_sd8x8.
     * NOTE(review): PTR_S is defined outside this chunk; if it expands to a
     * 32-bit store on some ABI only the first word would be written while the
     * pointer still advances by 8 - confirm. */
661     "punpckhwd  $f4, $f24, $f26                           \n\t"
662     "punpcklwd  $f6, $f24, $f26                           \n\t"
663     "psubw      $f6, $f6, $f4                             \n\t"
664     "dmfc1      $8, $f6                                   \n\t"
665     PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
666     PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"
667
    /* Constants 0x1 and 0x8 required by WELS_MAX_REG_MMI as its 3rd and 4th
     * arguments; the macro presumably reduces $f20 / $f22 to a single maximum
     * byte each - confirm against its definition. */
668     "dli        $8, 0x1                                   \n\t"
669     "dmtc1      $8, $f8                                   \n\t"
670     "dli        $8, 0x8                                   \n\t"
671     "dmtc1      $8, $f10                                  \n\t"
672     WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
673
    /* Low byte of each result becomes one p_mad8x8 entry. */
674     "dmfc1      $8, $f20                                  \n\t"
675     "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
676     "dmfc1      $8, $f22                                  \n\t"
677     "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
678     PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
679
    /* Prepare for rows 8-15: clear $f20/$f22/$f24/$f26. The punpckhwd with the
     * freshly zeroed $f20 keeps only the upper word of $f28 / $f30 and clears
     * the lower word that was just stored (assuming punpckhwd fd, fs, ft gives
     * low = high(fs), high = high(ft)). $f16/$f18 keep accumulating. */
680     "xor        $f20, $f20, $f20                          \n\t"
681     "xor        $f22, $f22, $f22                          \n\t"
682     "punpckhwd  $f28, $f20, $f28                          \n\t"
683     "xor        $f24, $f24, $f24                          \n\t"
684     "xor        $f26, $f26, $f26                          \n\t"
685     "punpckhwd  $f30, $f20, $f30                          \n\t"
    /* Rows 8-15. */
686     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
687                                  $f18, $15, %[ref_data], %[iPicStride])
688     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
689                                  $f18, $15, %[ref_data], %[iPicStride])
690     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
691                                  $f18, $15, %[ref_data], %[iPicStride])
692     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
693                                  $f18, $15, %[ref_data], %[iPicStride])
694     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
695                                  $f18, $15, %[ref_data], %[iPicStride])
696     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
697                                  $f18, $15, %[ref_data], %[iPicStride])
698     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
699                                  $f18, $15, %[ref_data], %[iPicStride])
700     WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
701                                  $f18, $15, %[ref_data], %[iPicStride])
702
    /* Lower-half psad8x8 entries, then add them to the frame total. */
703     "dmfc1      $8, $f28                                  \n\t"
704     "sw         $8, 0x0(%[psad8x8])                       \n\t"
705     "dmfc1      $8, $f30                                  \n\t"
706     "sw         $8, 0x4(%[psad8x8])                       \n\t"
707     PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"
708
709     "paddw      $f4, $f28, $f30                           \n\t"
710     "dmfc1      $12, $f4                                  \n\t"
711     PTR_ADDU   "$14, $14, $12                             \n\t"
712
    /* psum16x16 entry = value stored after rows 0-7 + second-half value. */
713     "paddw      $f4, $f24, $f26                           \n\t"
714     "dmfc1      $8, $f4                                   \n\t"
715     "lw         $12, 0x0(%[psum16x16])                    \n\t"
716     PTR_ADDU   "$8, $8, $12                               \n\t"
717     "sw         $8, 0x0(%[psum16x16])                     \n\t"
718     "xor        $f8, $f8, $f8                             \n\t"
719     PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
720
    /* Move the upper words of $f28 / $f30 (carried across both halves) into
     * the low lane, add them and store the result as the psqsum16x16 entry. */
721     "punpckhwd  $f30, $f30, $f8                           \n\t"
722     "punpckhwd  $f28, $f28, $f8                           \n\t"
723     "paddw      $f8, $f28, $f30                           \n\t"
724     "dmfc1      $8, $f8                                   \n\t"
725     "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
726     PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
727
    /* Lane differences for the lower half, same scheme as after rows 0-7. */
728     "punpckhwd  $f4, $f24, $f26                           \n\t"
729     "punpcklwd  $f6, $f24, $f26                           \n\t"
730     "psubw      $f6, $f6, $f4                             \n\t"
731     "dmfc1      $8, $f6                                   \n\t"
732     PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
733     PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"
734
    /* Reload the 0x1 / 0x8 constants ($f8 was reused above) and reduce again. */
735     "dli        $8, 0x1                                   \n\t"
736     "dmtc1      $8, $f8                                   \n\t"
737     "dli        $8, 0x8                                   \n\t"
738     "dmtc1      $8, $f10                                  \n\t"
739     WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
740
741     "dmfc1      $8, $f20                                  \n\t"
742     "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
743     "dmfc1      $8, $f22                                  \n\t"
744     "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
745     PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
746
    /* Fold $f16 + $f18 and then their two word lanes into one 32-bit value
     * for the psqdiff16x16 entry. */
747     "paddw      $f20, $f16, $f18                          \n\t"
748 	  "punpcklwd  $f16, $f20, $f20                          \n\t"
749 	  "punpckhwd  $f18, $f20, $f20                          \n\t"
750     "paddw      $f16, $f16, $f18                          \n\t"
751     "dmfc1      $8, $f16                                  \n\t"
752     "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
753     PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"
754
    /* Step to the next block on the right: back up 16 rows, forward 16 bytes. */
755     PTR_SUBU   "$15, $15, $13                             \n\t"
756     PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
757     PTR_ADDIU  "$15, $15, 0x10                            \n\t"
758     PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
759
760     PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
761     "bnez       %[iPicWidth], 2b                          \n\t"
762     "nop                                                  \n\t"
    /* End of band: restore the column count and band-start pointers, then
     * move both pointers down by 16 rows. */
763     "move       %[iPicWidth], $9                          \n\t"
764     "move       $15, $10                                  \n\t"
765     "move       %[ref_data], $11                          \n\t"
766     PTR_ADDU   "$15, $15, $13                             \n\t"
767     PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
768
769     PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
770     "bnez       %[iPicHeight], 1b                         \n\t"
771     "nop                                                  \n\t"
772
    /* Publish the accumulated frame total. */
773     "sw         $14, 0x0(%[psadframe])                    \n\t"
    /* Every operand the asm writes to is declared read-write and early-clobber. */
774     : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
775       [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
776       [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
777 	    [psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
778       [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
    /* Operands that are only read. */
779     : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
780       [psadframe]"r"((int *)psadframe)
    /* Scratch registers named explicitly in the asm text or reserved for the macros. */
781     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
782       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
783       "$f24", "$f26", "$f28", "$f30"
784   );
785   RECOVER_REG;
786 }
787 
VAACalcSadVar_mmi(const uint8_t * cur_data,const uint8_t * ref_data,int32_t iPicWidth,int32_t iPicHeight,int32_t iPicStride,int32_t * psadframe,int32_t * psad8x8,int32_t * psum16x16,int32_t * psqsum16x16)788 void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
789                        int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
790                        int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
791                        int32_t *psqsum16x16) {
792   BACKUP_REG;
793   __asm__ volatile (
794     ".set       arch=loongson3a                           \n\t"
795     "move       $15, %[cur_data]                          \n\t"
796     "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
797     "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
798     "dsll       $13, %[iPicStride], 0x4                   \n\t"
799     "xor        $f0, $f0, $f0                             \n\t"
800     "xor        $f2, $f2, $f2                             \n\t"
801     "xor        $f28, $f28, $f28                          \n\t"
802     "xor        $f30, $f30, $f30                          \n\t"
803     "xor        $14, $14, $14                             \n\t"
804     "1:                                                   \n\t"
805     "move       $9, %[iPicWidth]                          \n\t"
806     "move       $10, $15                                  \n\t"
807     "move       $11, %[ref_data]                          \n\t"
808     "2:                                                   \n\t"
809     "xor        $f24, $f24, $f24                          \n\t"
810     "xor        $f26, $f26, $f26                          \n\t"
811     "xor        $f20, $f20, $f20                          \n\t"
812     "xor        $f22, $f22, $f22                          \n\t"
813     "xor        $f16, $f16, $f16                          \n\t"
814     "xor        $f18, $f18, $f18                          \n\t"
815     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
816     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
817     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
818     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
819     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
820     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
821     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
822     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
823     "paddw      $f28, $f24, $f28                          \n\t"
824     "paddw      $f30, $f26, $f30                          \n\t"
825     "dmfc1      $8, $f24                                  \n\t"
826     "sw         $8, 0x0(%[psad8x8])                       \n\t"
827     "dmfc1      $8, $f26                                  \n\t"
828     "sw         $8, 0x4(%[psad8x8])                       \n\t"
829 
830     "xor        $f24, $f24, $f24                          \n\t"
831     "xor        $f26, $f26, $f26                          \n\t"
832     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
833     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
834     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
835     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
836     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
837     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
838     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
839     WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
840     "paddw      $f28, $f24, $f28                          \n\t"
841     "paddw      $f30, $f26, $f30                          \n\t"
842     "dmfc1      $8, $f24                                  \n\t"
843     "sw         $8, 0x8(%[psad8x8])                       \n\t"
844     "dmfc1      $8, $f26                                  \n\t"
845     "sw         $8, 0xc(%[psad8x8])                       \n\t"
846     PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"
847 
848     "paddw      $f20, $f20, $f22                          \n\t"
849     "dmfc1      $8, $f20                                  \n\t"
850     "sw         $8, 0x0(%[psum16x16])                     \n\t"
851     PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
852 
853     "paddw      $f20, $f16, $f18                          \n\t"
854 	  "punpcklwd  $f16, $f20, $f20                          \n\t"
855 	  "punpckhwd  $f18, $f20, $f20                          \n\t"
856     "paddw      $f16, $f16, $f18                          \n\t"
857     "dmfc1      $8, $f16                                  \n\t"
858     "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
859     PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
860 
861     PTR_SUBU   "$15, $15, $13                             \n\t"
862     PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
863     PTR_ADDIU  "$15, $15, 0x10                            \n\t"
864     PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
865 
866     PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
867     "bnez       %[iPicWidth], 2b                          \n\t"
868     "nop                                                  \n\t"
869     "move       %[iPicWidth], $9                          \n\t"
870     "move       $15, $10                                  \n\t"
871     "move       %[ref_data], $11                          \n\t"
872     PTR_ADDU   "$15, $15, $13                             \n\t"
873     PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
874 
875     PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
876     "bnez       %[iPicHeight], 1b                         \n\t"
877     "nop                                                  \n\t"
878 
879     "paddw      $f28, $f28, $f30                          \n\t"
880     "dmfc1      $8, $f28                                  \n\t"
881     "sw         $8, 0x0(%[psadframe])                     \n\t"
882     : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
883       [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
884       [psqsum16x16]"+&r"((int *)psqsum16x16)
885     : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
886       [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
887     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
888       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
889       "$f24", "$f26", "$f28", "$f30"
890   );
891   RECOVER_REG;
892 }
893