• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    satd_sad_mmi.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    23/07/2018 Created
37  *
38  *************************************************************************************
39  */
40 #include <stdint.h>
41 #include "asmdefs_mmi.h"
42 
/*
 * MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0)
 *
 * Folds the halfword lanes held in the register pair f0/f2 together using
 * unsigned saturating halfword adds (paddush).
 *
 *   f0, f2  : in/out - values to reduce; results are left in f0 and f2.
 *   f4, f6  : scratch - overwritten (f4 starts as a copy of f2, f6 as zero).
 *   f8, f10 : scratch - loaded here with the shift counts 0x10 and 0x20.
 *   r0      : scratch general-purpose register used to build those counts.
 *
 * Sequence: f0 += f2, then the data is folded with a 32-bit right shift plus
 * word unpacks, and finally with 16-bit right shifts.
 * NOTE(review): this looks like a horizontal sum whose total ends up in the
 * low halfword of f0 - confirm against the callers before relying on it.
 */
43 #define MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0) \
44   "dli        "#r0", 0x10                               \n\t" \
45   "dmtc1      "#r0", "#f8"                              \n\t" \
46   "dli        "#r0", 0x20                               \n\t" \
47   "dmtc1      "#r0", "#f10"                             \n\t" \
48   "mov.d      "#f4", "#f2"                              \n\t" \
49   "xor        "#f6", "#f6", "#f6"                       \n\t" \
50   "paddush    "#f0", "#f0", "#f4"                       \n\t" \
51   "paddush    "#f2", "#f2", "#f6"                       \n\t" \
52   "dsrl       "#f6", "#f2", "#f10"                      \n\t" \
53   "punpcklwd  "#f4", "#f2", "#f2"                       \n\t" \
54   "punpckhwd  "#f4", "#f0", "#f4"                       \n\t" \
55   "paddush    "#f0", "#f0", "#f4"                       \n\t" \
56   "paddush    "#f2", "#f2", "#f6"                       \n\t" \
57   "dsrl       "#f4", "#f0", "#f8"                       \n\t" \
58   "pinsrh_3   "#f4", "#f4", "#f2"                       \n\t" \
59   "dsrl       "#f6", "#f2", "#f8"                       \n\t" \
60   "paddush    "#f0", "#f0", "#f4"                       \n\t" \
61   "paddush    "#f2", "#f2", "#f6"                       \n\t"
62 
/*
 * MMI_GetSad8x4
 *
 * Accumulates the sum of absolute differences of a block 8 bytes wide and
 * 4 rows tall. Rows are read from %[pSample1] and %[pSample2] with
 * gsldlc1/gsldrc1 pairs (unaligned 8-byte loads), stepping by %[iStride1]
 * and %[iStride2].
 *
 * For each row: pasubub gives the per-byte absolute difference and biadd
 * sums the eight bytes. Rows 0 and 1 are added into $f24, rows 2 and 3 into
 * $f26 (paddh). The caller must have initialised $f24/$f26.
 *
 * On exit %[pSample1] and %[pSample2] have each advanced by four strides,
 * so the macro can be chained for taller blocks.
 *
 * Clobbers: $8, $9, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14.
 */
63 #define MMI_GetSad8x4 \
64   PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" \
65   "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t" \
66   "gsldlc1    $f4, 0x7($8)                              \n\t" \
67   "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t" \
68   "gsldrc1    $f4, 0x0($8)                              \n\t" \
69   PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" \
70   PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" \
71   "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t" \
72   "gsldlc1    $f6, 0x7($8)                              \n\t" \
73   "gsldlc1    $f8, 0x7(%[pSample2])                     \n\t" \
74   "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t" \
75   "gsldrc1    $f6, 0x0($8)                              \n\t" \
76   "gsldrc1    $f8, 0x0(%[pSample2])                     \n\t" \
77   PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" \
78   PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" \
79   "gsldlc1    $f12, 0x7($9)                             \n\t" \
80   "gsldlc1    $f10, 0x7(%[pSample2])                    \n\t" \
81   "gsldrc1    $f12, 0x0($9)                             \n\t" \
82   PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" \
83   "gsldrc1    $f10, 0x0(%[pSample2])                    \n\t" \
84   "gsldlc1    $f14, 0x7($9)                             \n\t" \
85   "gsldrc1    $f14, 0x0($9)                             \n\t" \
86   "pasubub    $f0, $f0, $f8                             \n\t" \
87   PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" \
88   "pasubub    $f2, $f2, $f10                            \n\t" \
89   "biadd      $f0, $f0                                  \n\t" \
90   "biadd      $f2, $f2                                  \n\t" \
91   "pasubub    $f4, $f4, $f12                            \n\t" \
92   PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" \
93   "pasubub    $f6, $f6, $f14                            \n\t" \
94   "biadd      $f4, $f4                                  \n\t" \
95   "biadd      $f6, $f6                                  \n\t" \
96   "paddh      $f24, $f24, $f0                           \n\t" \
97   "paddh      $f26, $f26, $f2                           \n\t" \
98   "paddh      $f24, $f24, $f4                           \n\t" \
99   "paddh      $f26, $f26, $f6                           \n\t"
100 
/*
 * MMI_GetSad8x4_End
 *
 * Final-step variant of MMI_GetSad8x4: same loads and the same accumulation
 * (rows 0 and 1 into $f24, rows 2 and 3 into $f26), but without the trailing
 * pointer updates. On exit %[pSample1] and %[pSample2] are left pointing at
 * row 2 of the block (two strides past their entry values), so the pointers
 * never step past the last block.
 *
 * Clobbers: $8, $9, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14.
 */
101 #define MMI_GetSad8x4_End \
102   PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" \
103   "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t" \
104   "gsldlc1    $f4, 0x7($8)                              \n\t" \
105   "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t" \
106   "gsldrc1    $f4, 0x0($8)                              \n\t" \
107   PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" \
108   PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" \
109   "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t" \
110   "gsldlc1    $f6, 0x7($8)                              \n\t" \
111   "gsldlc1    $f8, 0x7(%[pSample2])                     \n\t" \
112   "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t" \
113   "gsldrc1    $f6, 0x0($8)                              \n\t" \
114   "gsldrc1    $f8, 0x0(%[pSample2])                     \n\t" \
115   PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" \
116   PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" \
117   "gsldlc1    $f12, 0x7($9)                             \n\t" \
118   "gsldlc1    $f10, 0x7(%[pSample2])                    \n\t" \
119   "gsldrc1    $f12, 0x0($9)                             \n\t" \
120   PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" \
121   "gsldrc1    $f10, 0x0(%[pSample2])                    \n\t" \
122   "gsldlc1    $f14, 0x7($9)                             \n\t" \
123   "gsldrc1    $f14, 0x0($9)                             \n\t" \
124   "pasubub    $f0, $f0, $f8                             \n\t" \
125   "pasubub    $f2, $f2, $f10                            \n\t" \
126   "biadd      $f0, $f0                                  \n\t" \
127   "biadd      $f2, $f2                                  \n\t" \
128   "pasubub    $f4, $f4, $f12                            \n\t" \
129   "pasubub    $f6, $f6, $f14                            \n\t" \
130   "biadd      $f4, $f4                                  \n\t" \
131   "biadd      $f6, $f6                                  \n\t" \
132   "paddh      $f24, $f24, $f0                           \n\t" \
133   "paddh      $f26, $f26, $f2                           \n\t" \
134   "paddh      $f24, $f24, $f4                           \n\t" \
135   "paddh      $f26, $f26, $f6                           \n\t"
136 
/*
 * CACHE_SPLIT_CHECK(r0, width, cacheline)
 *
 * Replaces r0 (in place) with (r0 & 0x1f) - 0x1f. The result is zero when
 * the low five bits of r0 are all set and negative otherwise, so the caller
 * can branch on it.
 *
 * NOTE(review): the `width` and `cacheline` parameters are accepted but not
 * used; the mask and offset are hard-coded to 0x1f. Confirm whether that is
 * intentional before passing other values.
 */
137 #define CACHE_SPLIT_CHECK(r0, width, cacheline) \
138   "and        "#r0", "#r0", 0x1f                        \n\t" \
139   PTR_ADDIU  ""#r0", "#r0", -0x1f                       \n\t"
140 
/*
 * MMI_GetSad2x16
 *
 * Accumulates the SAD of two 16-byte rows. Unlike the 4x16 macros, both
 * pointers are advanced by one stride BEFORE the first load, so on entry
 * %[pSample1]/%[pSample2] must point at the row preceding the two rows to
 * process; on exit they point at the second processed row.
 *
 * %[pSample2] is read with gsldlc1/gsldrc1 pairs (unaligned); %[pSample1]
 * is read with a single gslqc1 quad-word load.
 * NOTE(review): gslqc1 presumably requires %[pSample1] to be 16-byte
 * aligned - confirm against the ISA manual / callers.
 *
 * Bytes 0..7 of each row are summed into $f0 and bytes 8..15 into $f2
 * (pasubub -> biadd -> paddh). The caller must have initialised $f0/$f2.
 *
 * Clobbers: $f4, $f6, $f8, $f10.
 */
141 #define MMI_GetSad2x16 \
142   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
143   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
144   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
145   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
146   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
147   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
148   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
149   "pasubub    $f4, $f4, $f8                             \n\t" \
150   "pasubub    $f6, $f6, $f10                            \n\t" \
151   "biadd      $f4, $f4                                  \n\t" \
152   "biadd      $f6, $f6                                  \n\t" \
153   "paddh      $f0, $f0, $f4                             \n\t" \
154   "paddh      $f2, $f2, $f6                             \n\t" \
155   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
156   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
157   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
158   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
159   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
160   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
161   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
162   "pasubub    $f4, $f4, $f8                             \n\t" \
163   "pasubub    $f6, $f6, $f10                            \n\t" \
164   "biadd      $f4, $f4                                  \n\t" \
165   "biadd      $f6, $f6                                  \n\t" \
166   "paddh      $f0, $f0, $f4                             \n\t" \
167   "paddh      $f2, $f2, $f6                             \n\t"
168 
/*
 * MMI_GetSad4x16
 *
 * Accumulates the SAD of four 16-byte rows, with %[pSample2] at any
 * alignment (gsldlc1/gsldrc1 pairs) and %[pSample1] read via gslqc1.
 * NOTE(review): gslqc1 presumably requires %[pSample1] to be 16-byte
 * aligned - confirm.
 *
 * Per row: pasubub (byte absolute difference), biadd (sum of 8 bytes),
 * paddh into the accumulators. Bytes 0..7 go to $f28 and bytes 8..15 to
 * $f30; the caller must have initialised both.
 *
 * Both pointers are advanced by one stride after every row, so on exit
 * they are four strides past their entry values.
 *
 * Clobbers: $f0, $f2, $f4, $f6, $f8, $f10.
 */
169 #define MMI_GetSad4x16 \
170   "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t" \
171   "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t" \
172   "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t" \
173   "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t" \
174   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
175   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
176   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
177   "pasubub    $f0, $f0, $f8                             \n\t" \
178   "pasubub    $f2, $f2, $f10                            \n\t" \
179   "biadd      $f0, $f0                                  \n\t" \
180   "biadd      $f2, $f2                                  \n\t" \
181   "paddh      $f28, $f28, $f0                           \n\t" \
182   "paddh      $f30, $f30, $f2                           \n\t" \
183   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
184   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
185   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
186   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
187   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
188   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
189   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
190   "pasubub    $f4, $f4, $f8                             \n\t" \
191   "pasubub    $f6, $f6, $f10                            \n\t" \
192   "biadd      $f4, $f4                                  \n\t" \
193   "biadd      $f6, $f6                                  \n\t" \
194   "paddh      $f28, $f28, $f4                           \n\t" \
195   "paddh      $f30, $f30, $f6                           \n\t" \
196   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
197   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
198   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
199   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
200   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
201   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
202   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
203   "pasubub    $f4, $f4, $f8                             \n\t" \
204   "pasubub    $f6, $f6, $f10                            \n\t" \
205   "biadd      $f4, $f4                                  \n\t" \
206   "biadd      $f6, $f6                                  \n\t" \
207   "paddh      $f28, $f28, $f4                           \n\t" \
208   "paddh      $f30, $f30, $f6                           \n\t" \
209   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
210   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
211   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
212   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
213   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
214   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
215   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
216   "pasubub    $f4, $f4, $f8                             \n\t" \
217   "pasubub    $f6, $f6, $f10                            \n\t" \
218   "biadd      $f4, $f4                                  \n\t" \
219   "biadd      $f6, $f6                                  \n\t" \
220   "paddh      $f28, $f28, $f4                           \n\t" \
221   "paddh      $f30, $f30, $f6                           \n\t"
222 
/*
 * MMI_GetSad4x16_Aligned
 *
 * Same computation as MMI_GetSad4x16 (SAD of four 16-byte rows accumulated
 * into $f28 for bytes 0..7 and $f30 for bytes 8..15), but BOTH sources are
 * read with gslqc1 quad-word loads instead of unaligned load pairs.
 * NOTE(review): this presumably requires %[pSample1] and %[pSample2] to be
 * 16-byte aligned on every row - confirm.
 *
 * Both pointers are advanced by one stride after every row (four strides
 * in total).
 *
 * Clobbers: $f0, $f2, $f4, $f6, $f8, $f10.
 */
223 #define MMI_GetSad4x16_Aligned \
224   "gslqc1     $f2, $f0, 0x0(%[pSample2])                \n\t" \
225   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
226   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
227   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
228   "pasubub    $f0, $f0, $f8                             \n\t" \
229   "pasubub    $f2, $f2, $f10                            \n\t" \
230   "biadd      $f0, $f0                                  \n\t" \
231   "biadd      $f2, $f2                                  \n\t" \
232   "paddh      $f28, $f28, $f0                           \n\t" \
233   "paddh      $f30, $f30, $f2                           \n\t" \
234   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
235   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
236   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
237   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
238   "pasubub    $f4, $f4, $f8                             \n\t" \
239   "pasubub    $f6, $f6, $f10                            \n\t" \
240   "biadd      $f4, $f4                                  \n\t" \
241   "biadd      $f6, $f6                                  \n\t" \
242   "paddh      $f28, $f28, $f4                           \n\t" \
243   "paddh      $f30, $f30, $f6                           \n\t" \
244   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
245   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
246   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
247   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
248   "pasubub    $f4, $f4, $f8                             \n\t" \
249   "pasubub    $f6, $f6, $f10                            \n\t" \
250   "biadd      $f4, $f4                                  \n\t" \
251   "biadd      $f6, $f6                                  \n\t" \
252   "paddh      $f28, $f28, $f4                           \n\t" \
253   "paddh      $f30, $f30, $f6                           \n\t" \
254   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
255   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
256   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
257   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
258   "pasubub    $f4, $f4, $f8                             \n\t" \
259   "pasubub    $f6, $f6, $f10                            \n\t" \
260   "biadd      $f4, $f4                                  \n\t" \
261   "biadd      $f6, $f6                                  \n\t" \
262   "paddh      $f28, $f28, $f4                           \n\t" \
263   "paddh      $f30, $f30, $f6                           \n\t"
264 
/*
 * MMI_GetSad4x16_End
 *
 * Final-step variant of MMI_GetSad4x16: identical loads and accumulation
 * into $f28 (bytes 0..7) and $f30 (bytes 8..15), but the pointers are NOT
 * advanced after the fourth row. On exit %[pSample1]/%[pSample2] point at
 * the last processed row (three strides past their entry values).
 *
 * %[pSample2] is read with unaligned load pairs; %[pSample1] with gslqc1.
 * NOTE(review): gslqc1 presumably requires 16-byte alignment - confirm.
 *
 * Clobbers: $f0, $f2, $f4, $f6, $f8, $f10.
 */
265 #define MMI_GetSad4x16_End \
266   "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t" \
267   "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t" \
268   "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t" \
269   "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t" \
270   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
271   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
272   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
273   "pasubub    $f0, $f0, $f8                             \n\t" \
274   "pasubub    $f2, $f2, $f10                            \n\t" \
275   "biadd      $f0, $f0                                  \n\t" \
276   "biadd      $f2, $f2                                  \n\t" \
277   "paddh      $f28, $f28, $f0                           \n\t" \
278   "paddh      $f30, $f30, $f2                           \n\t" \
279   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
280   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
281   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
282   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
283   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
284   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
285   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
286   "pasubub    $f4, $f4, $f8                             \n\t" \
287   "pasubub    $f6, $f6, $f10                            \n\t" \
288   "biadd      $f4, $f4                                  \n\t" \
289   "biadd      $f6, $f6                                  \n\t" \
290   "paddh      $f28, $f28, $f4                           \n\t" \
291   "paddh      $f30, $f30, $f6                           \n\t" \
292   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
293   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
294   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
295   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
296   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
297   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
298   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
299   "pasubub    $f4, $f4, $f8                             \n\t" \
300   "pasubub    $f6, $f6, $f10                            \n\t" \
301   "biadd      $f4, $f4                                  \n\t" \
302   "biadd      $f6, $f6                                  \n\t" \
303   "paddh      $f28, $f28, $f4                           \n\t" \
304   "paddh      $f30, $f30, $f6                           \n\t" \
305   "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
306   "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
307   "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
308   "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
309   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
310   "pasubub    $f4, $f4, $f8                             \n\t" \
311   "pasubub    $f6, $f6, $f10                            \n\t" \
312   "biadd      $f4, $f4                                  \n\t" \
313   "biadd      $f6, $f6                                  \n\t" \
314   "paddh      $f28, $f28, $f4                           \n\t" \
315   "paddh      $f30, $f30, $f6                           \n\t"
316 
/*
 * MMI_GetSad4x16_Aligned_End
 *
 * Final-step variant of MMI_GetSad4x16_Aligned: both sources are read with
 * gslqc1 quad-word loads and the SAD of four 16-byte rows is accumulated
 * into $f28 (bytes 0..7) and $f30 (bytes 8..15), but the pointers are NOT
 * advanced after the fourth row. On exit %[pSample1]/%[pSample2] point at
 * the last processed row (three strides past their entry values).
 * NOTE(review): gslqc1 presumably requires both pointers to be 16-byte
 * aligned - confirm.
 *
 * Clobbers: $f0, $f2, $f4, $f6, $f8, $f10.
 */
317 #define MMI_GetSad4x16_Aligned_End \
318   "gslqc1     $f2, $f0, 0x0(%[pSample2])                \n\t" \
319   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
320   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
321   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
322   "pasubub    $f0, $f0, $f8                             \n\t" \
323   "pasubub    $f2, $f2, $f10                            \n\t" \
324   "biadd      $f0, $f0                                  \n\t" \
325   "biadd      $f2, $f2                                  \n\t" \
326   "paddh      $f28, $f28, $f0                           \n\t" \
327   "paddh      $f30, $f30, $f2                           \n\t" \
328   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
329   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
330   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
331   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
332   "pasubub    $f4, $f4, $f8                             \n\t" \
333   "pasubub    $f6, $f6, $f10                            \n\t" \
334   "biadd      $f4, $f4                                  \n\t" \
335   "biadd      $f6, $f6                                  \n\t" \
336   "paddh      $f28, $f28, $f4                           \n\t" \
337   "paddh      $f30, $f30, $f6                           \n\t" \
338   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
339   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
340   PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
341   PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
342   "pasubub    $f4, $f4, $f8                             \n\t" \
343   "pasubub    $f6, $f6, $f10                            \n\t" \
344   "biadd      $f4, $f4                                  \n\t" \
345   "biadd      $f6, $f6                                  \n\t" \
346   "paddh      $f28, $f28, $f4                           \n\t" \
347   "paddh      $f30, $f30, $f6                           \n\t" \
348   "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" \
349   "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
350   "pasubub    $f4, $f4, $f8                             \n\t" \
351   "pasubub    $f6, $f6, $f10                            \n\t" \
352   "biadd      $f4, $f4                                  \n\t" \
353   "biadd      $f6, $f6                                  \n\t" \
354   "paddh      $f28, $f28, $f4                           \n\t" \
355   "paddh      $f30, $f30, $f6                           \n\t"
356 
/*
 * MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0)
 *
 * Accumulates four 16-byte SADs in one pass. Each 16-byte row is held as a
 * register pair (low 8 bytes, high 8 bytes). The accumulators are
 * hard-coded, not parameters:
 *
 *   $f20/$f22 += SAD(f0:f2,   f12:f14)
 *   $f16/$f18 += SAD(f12:f14, f8:f10)
 *   $f24/$f26 += SAD(16 bytes loaded from r0 - 1, f4:f6)
 *   $f28/$f30 += SAD(16 bytes loaded from r0 + 1, f4:f6)
 *
 * The two memory rows are fetched with gsldlc1/gsldrc1 pairs (unaligned
 * loads) at byte offsets -1 and +1 from r0.
 * NOTE(review): this presumably compares a source row (f4:f6, f12:f14 on
 * entry) against the reference shifted up/down/left/right - confirm the
 * register roles with the callers.
 *
 * Clobbers: f0, f2, f12, f14 (f4, f6, f8, f10 and r0 are only read).
 */
357 #define MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0) \
358   "pasubub    "#f0", "#f0", "#f12"                      \n\t" \
359   "pasubub    "#f2", "#f2", "#f14"                      \n\t" \
360   "pasubub    "#f12", "#f12", "#f8"                     \n\t" \
361   "pasubub    "#f14", "#f14", "#f10"                    \n\t" \
362   "biadd      "#f0", "#f0"                              \n\t" \
363   "biadd      "#f2", "#f2"                              \n\t" \
364   "biadd      "#f12", "#f12"                            \n\t" \
365   "biadd      "#f14", "#f14"                            \n\t" \
366   "paddh      $f20, $f20, "#f0"                         \n\t" \
367   "paddh      $f22, $f22, "#f2"                         \n\t" \
368   "paddh      $f16, $f16, "#f12"                        \n\t" \
369   "paddh      $f18, $f18, "#f14"                        \n\t" \
370   "gsldlc1    "#f12", 0x6("#r0")                        \n\t" \
371   "gsldlc1    "#f14", 0xE("#r0")                        \n\t" \
372   "gsldrc1    "#f12", -0x1("#r0")                       \n\t" \
373   "gsldrc1    "#f14", 0x7("#r0")                        \n\t" \
374   "pasubub    "#f12", "#f12", "#f4"                     \n\t" \
375   "pasubub    "#f14", "#f14", "#f6"                     \n\t" \
376   "biadd      "#f12", "#f12"                            \n\t" \
377   "biadd      "#f14", "#f14"                            \n\t" \
378   "paddh      $f24, $f24, "#f12"                        \n\t" \
379   "paddh      $f26, $f26, "#f14"                        \n\t" \
380   "gsldlc1    "#f12", 0x8("#r0")                        \n\t" \
381   "gsldlc1    "#f14", 0x10("#r0")                       \n\t" \
382   "gsldrc1    "#f12", 0x1("#r0")                        \n\t" \
383   "gsldrc1    "#f14", 0x9("#r0")                        \n\t" \
384   "pasubub    "#f12", "#f12", "#f4"                     \n\t" \
385   "pasubub    "#f14", "#f14", "#f6"                     \n\t" \
386   "biadd      "#f12", "#f12"                            \n\t" \
387   "biadd      "#f14", "#f14"                            \n\t" \
388   "paddh      $f28, $f28, "#f12"                        \n\t" \
389   "paddh      $f30, $f30, "#f14"                        \n\t"
390 
/*
 * MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18)
 *
 * Applies MMI_SumSub (defined outside this file section) four times to the
 * register pairs, in this order: (f0:f2, f4:f6), (f8:f10, f12:f14),
 * (f4:f6, f12:f14), (f0:f2, f8:f10). f16/f18 are passed to every call as
 * the last two arguments.
 * NOTE(review): presumably this is the two-stage butterfly of a 4-point
 * Hadamard transform over two 4x4 blocks, with f16/f18 as scratch - confirm
 * against the MMI_SumSub definition.
 */
391 #define MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
392   MMI_SumSub(f0, f2, f4, f6, f16, f18)      \
393   MMI_SumSub(f8, f10, f12, f14, f16, f18)   \
394   MMI_SumSub(f4, f6, f12, f14, f16, f18)    \
395   MMI_SumSub(f0, f2, f8, f10, f16, f18)
396 
/*
 * MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26)
 *
 * Runs WELS_AbsH (defined outside this file section) in place on the four
 * register pairs f0:f2, f4:f6, f12:f14 and f16:f18, then adds all four
 * pairs into the accumulator pair f24:f26 with unsigned saturating
 * halfword adds (paddush).
 *
 *   f0:f2, f4:f6, f12:f14, f16:f18 : inputs; overwritten.
 *   f8:f10, f20:f22               : passed to WELS_AbsH as its last two
 *                                   arguments (presumably scratch - confirm).
 *   f24:f26                       : in/out accumulators.
 */
397 #define MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26) \
398   WELS_AbsH(f0, f2, f0, f2, f8, f10)                          \
399   WELS_AbsH(f4, f6, f4, f6, f8, f10)                          \
400   WELS_AbsH(f12, f14, f12, f14, f20, f22)                     \
401   WELS_AbsH(f16, f18, f16, f18, f20, f22)                     \
402   "paddush    "#f0", "#f0", "#f4"                       \n\t" \
403   "paddush    "#f2", "#f2", "#f6"                       \n\t" \
404   "paddush    "#f12", "#f12", "#f16"                    \n\t" \
405   "paddush    "#f14", "#f14", "#f18"                    \n\t" \
406   "paddush    "#f24", "#f24", "#f0"                     \n\t" \
407   "paddush    "#f26", "#f26", "#f2"                     \n\t" \
408   "paddush    "#f24", "#f24", "#f12"                    \n\t" \
409   "paddush    "#f26", "#f26", "#f14"                    \n\t"
410 
/*
 * MMI_SumWHorizon(f0, f2, f4, f6, f8, f10)
 *
 * Reduces the halfword lanes of f0 and f2 to word sums left in f0:
 *   1. f0 += f2 (halfword add);
 *   2. the halfwords of f0 are interleaved with f8 into words (high half
 *      into f2, low half into f0) and the two are added as words;
 *   3. f2 = pshufh(f0, f10) and f0 += f2 (word add).
 *
 *   f0  : in/out - result.
 *   f2  : in; overwritten.
 *   f8  : read only. NOTE(review): presumably must be zero so that the
 *         interleave zero-extends halfwords to words - confirm.
 *   f10 : read only; pshufh selector supplied by the caller.
 *   f4, f6 : accepted but not used by this macro.
 */
411 #define MMI_SumWHorizon(f0, f2, f4, f6, f8, f10) \
412   "paddh      "#f0", "#f0", "#f2"                       \n\t" \
413   "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
414   "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
415   "paddw      "#f0", "#f0", "#f2"                       \n\t" \
416   "pshufh     "#f2", "#f0", "#f10"                      \n\t" \
417   "paddw      "#f0", "#f0", "#f2"                       \n\t"
418 
/*
 * MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1)
 *
 * Loads 8 bytes from r0 into f0 and 8 bytes from r1 into f4 (unaligned
 * gsldlc1/gsldrc1 pairs), widens both to halfwords by interleaving with f8
 * (low four into f0 / f4, high four into f2 / f6) and leaves the halfword
 * differences r0-row minus r1-row in f0 (low) and f2 (high).
 * NOTE(review): f8 presumably must be zero so the interleave zero-extends
 * the bytes - confirm.
 *
 * Side effect: sets $11 = %[pSample1] + %[iStride1] and
 * $12 = %[pSample2] + %[iStride2], i.e. the addresses of the next row.
 * These always use the named asm operands, regardless of r0/r1.
 *
 * Clobbers: f0, f2, f4, f6, $11, $12.
 */
419 #define MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1) \
420   "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
421   "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
422   PTR_ADDU   "$11, %[pSample1], %[iStride1]   \n\t" \
423   "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
424   "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
425   PTR_ADDU   "$12, %[pSample2], %[iStride2]   \n\t" \
426   "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
427   "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
428   "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
429   "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
430   "psubh      "#f0", "#f0", "#f4"             \n\t" \
431   "psubh      "#f2", "#f2", "#f6"             \n\t"
432 
/*
 * MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1)
 *
 * Same load / widen / subtract as MMI_LoadDiff8P_Offset_Stride0: 8 bytes
 * from r0 and from r1 are widened to halfwords by interleaving with f8,
 * and the differences are left in f0 (low four) and f2 (high four).
 * NOTE(review): f8 presumably must be zero - confirm.
 *
 * Side effect: sets %[pSample1] = $11 + %[iStride1] and
 * %[pSample2] = $12 + %[iStride2]. $11/$12 must already hold the current
 * row addresses (MMI_LoadDiff8P_Offset_Stride0 sets them), so a
 * Stride0 + Stride1 pair moves both sample pointers forward by two rows.
 *
 * Clobbers: f0, f2, f4, f6, %[pSample1], %[pSample2].
 */
433 #define MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1) \
434   "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
435   "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
436   PTR_ADDU   "%[pSample1], $11, %[iStride1]   \n\t" \
437   "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
438   "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
439   PTR_ADDU   "%[pSample2], $12, %[iStride2]   \n\t" \
440   "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
441   "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
442   "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
443   "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
444   "psubh      "#f0", "#f0", "#f4"             \n\t" \
445   "psubh      "#f2", "#f2", "#f6"             \n\t"
446 
/*
 * MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1)
 *
 * Same load / widen / subtract as the other MMI_LoadDiff8P_Offset_* macros:
 * 8 bytes from r0 and from r1 are widened to halfwords by interleaving with
 * f8, and the differences are left in f0 (low four) and f2 (high four).
 * NOTE(review): f8 presumably must be zero - confirm.
 *
 * Side effect: repositions the sample pointers to
 * %[pSample1] = $9 + 8 and %[pSample2] = $10 + 8.
 * NOTE(review): $9/$10 are not written here; presumably the caller saved
 * the block's starting addresses in them so this jumps to the right-hand
 * 8-pixel column - confirm with the calling function.
 *
 * Clobbers: f0, f2, f4, f6, %[pSample1], %[pSample2].
 */
447 #define MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1) \
448   "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
449   "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
450   PTR_ADDU   "%[pSample1], $9, 0x8            \n\t" \
451   "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
452   "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
453   PTR_ADDU   "%[pSample2], $10, 0x8           \n\t" \
454   "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
455   "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
456   "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
457   "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
458   "psubh      "#f0", "#f0", "#f4"             \n\t" \
459   "psubh      "#f2", "#f2", "#f6"             \n\t"
460 
/*
 * MMI_GetSatd8x8
 *
 * Processes an 8x8 block as two groups of four rows. For each group:
 *   - four rows of pSample1 - pSample2 differences are loaded as halfwords
 *     (alternating the Stride0 / Stride1 loaders, which also step the
 *     pointers);
 *   - MMI_HDMTwo4x4, MMI_TransTwo4x4H (defined outside this file section)
 *     and MMI_HDMTwo4x4 again are applied;
 *   - MMI_SumAbs4 adds the absolute values into $f24/$f26.
 *
 * $f28 is passed to every loader as the byte-widening operand
 * (NOTE(review): presumably must be zero on entry - confirm). $f24/$f26
 * must be initialised by the caller.
 *
 * On exit %[pSample1] and %[pSample2] have advanced by eight rows.
 * Uses $f0-$f22 as working registers and clobbers $11, $12.
 */
461 #define MMI_GetSatd8x8 \
462   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
463   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
464   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
465   MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
466   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
467   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
468   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
469   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
470   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
471   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
472   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
473   MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
474   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
475   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
476   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
477   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
478 
/*
 * MMI_GetSatd8x8_Offset8
 *
 * Same computation as MMI_GetSatd8x8 (two groups of four difference rows,
 * each run through MMI_HDMTwo4x4 / MMI_TransTwo4x4H / MMI_HDMTwo4x4 and
 * summed into $f24/$f26 by MMI_SumAbs4), except that the eighth row is
 * loaded with MMI_LoadDiff8P_Offset8. That loader sets
 * %[pSample1] = $9 + 8 and %[pSample2] = $10 + 8 instead of stepping one
 * more row.
 * NOTE(review): $9/$10 are presumably the saved base addresses of the
 * current 8-row band, so the next 8x8 block starts 8 pixels to the right -
 * confirm with the calling function.
 *
 * $f28 is the byte-widening operand for every loader (presumably zero).
 * Uses $f0-$f22 as working registers and clobbers $11, $12.
 */
479 #define MMI_GetSatd8x8_Offset8 \
480   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
481   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
482   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
483   MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
484   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
485   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
486   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
487   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
488   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
489   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
490   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
491   MMI_LoadDiff8P_Offset8($f12, $f14, $f20, $f22, $f28, $11, $12)                             \
492   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
493   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
494   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
495   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
496 
/*
 * MMI_GetSatd8x8_End
 *
 * Final-block variant of MMI_GetSatd8x8: the same two groups of four
 * difference rows are transformed (MMI_HDMTwo4x4 / MMI_TransTwo4x4H /
 * MMI_HDMTwo4x4) and summed into $f24/$f26 by MMI_SumAbs4, but the eighth
 * row is loaded with MMI_LoadDiff8P (defined outside this file section)
 * from $11/$12 instead of a pointer-updating loader.
 * NOTE(review): MMI_LoadDiff8P presumably leaves %[pSample1]/%[pSample2]
 * untouched, so the pointers stay at row 6 of this block on exit - confirm
 * against its definition.
 *
 * $f28 is the byte-widening operand for every loader (presumably zero).
 * Uses $f0-$f22 as working registers and clobbers $11, $12.
 */
497 #define MMI_GetSatd8x8_End \
498   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
499   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
500   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
501   MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
502   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
503   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
504   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
505   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
506   MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        \
507   MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
508   MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
509   MMI_LoadDiff8P($f12, $f14, $f20, $f22, $f28, $11, $12)                                     \
510   MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
511   MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
512   MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
513   MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
514 
WelsSampleSad16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)515 int32_t WelsSampleSad16x16_mmi (uint8_t* pSample1, int32_t iStride1,
516                                 uint8_t* pSample2, int32_t iStride2) {
517   int32_t iSadSum = 0;
518   BACKUP_REG;
519   __asm__ volatile (
520     ".set       arch=loongson3a                           \n\t"
521     "and        $8, %[pSample2], 0xF                      \n\t"
522     "xor        $f28, $f28, $f28                          \n\t"
523     "xor        $f30, $f30, $f30                          \n\t"
524     "bnez       $8, unaligned                             \n\t"
525     "aligned:                                             \n\t"
526     MMI_GetSad4x16_Aligned
527     MMI_GetSad4x16_Aligned
528     MMI_GetSad4x16_Aligned
529     MMI_GetSad4x16_Aligned_End
530     "b          out                                       \n\t"
531 
532     "unaligned:                                           \n\t"
533     MMI_GetSad4x16
534     MMI_GetSad4x16
535     MMI_GetSad4x16
536     MMI_GetSad4x16_End
537     "out:                                                 \n\t"
538     "mov.d      $f0, $f30                                 \n\t"
539     "paddh      $f0, $f0, $f28                            \n\t"
540     "dmfc1      %[iSadSum], $f0                           \n\t"
541     : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
542       [pSample2]"+&r"((unsigned char *)pSample2)
543     : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
544     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
545       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
546   );
547   RECOVER_REG;
548   return iSadSum;
549 }
550 
WelsSampleSad16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)551 int32_t WelsSampleSad16x8_mmi (uint8_t* pSample1, int32_t iStride1,
552                                uint8_t* pSample2, int32_t iStride2) {
553   int32_t iSadSum = 0;
554   BACKUP_REG;
555   __asm__ volatile (
556     ".set       arch=loongson3a                           \n\t"
557     "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t"
558     "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t"
559     "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t"
560     "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t"
561     "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
562     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
563     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
564     "pasubub    $f0, $f0, $f8                             \n\t"
565     "pasubub    $f2, $f2, $f10                            \n\t"
566     "biadd      $f0, $f0                                  \n\t"
567     "biadd      $f2, $f2                                  \n\t"
568     "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
569     "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t"
570     "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
571     "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t"
572     "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
573     "pasubub    $f4, $f4, $f8                             \n\t"
574     "pasubub    $f6, $f6, $f10                            \n\t"
575     "biadd      $f4, $f4                                  \n\t"
576     "biadd      $f6, $f6                                  \n\t"
577     "paddh      $f0, $f0, $f4                             \n\t"
578     "paddh      $f2, $f2, $f6                             \n\t"
579 
580     MMI_GetSad2x16
581     MMI_GetSad2x16
582     MMI_GetSad2x16
583 
584     "paddh      $f0, $f0, $f2                             \n\t"
585     "dmfc1      %[iSadSum], $f0                           \n\t"
586     : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
587       [pSample2]"+&r"((unsigned char *)pSample2)
588     : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
589     : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
590       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
591   );
592   RECOVER_REG;
593   return iSadSum;
594 }
595 
WelsSampleSad8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)596 int32_t WelsSampleSad8x16_mmi (uint8_t* pSample1, int32_t iStride1,
597                                uint8_t* pSample2, int32_t iStride2) {
598   int32_t iSadSum = 0;
599   BACKUP_REG;
600   __asm__ volatile (
601     ".set       arch=loongson3a                           \n\t"
602     "xor        $f24, $f24, $f24                          \n\t"
603     "xor        $f26, $f26, $f26                          \n\t"
604     MMI_GetSad8x4
605     MMI_GetSad8x4
606     MMI_GetSad8x4
607     MMI_GetSad8x4_End
608     "paddh      $f0, $f26, $f24                           \n\t"
609     "dmfc1      %[iSadSum], $f0                           \n\t"
610     : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
611       [pSample2]"+&r"((unsigned char *)pSample2)
612     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
613     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
614       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
615   );
616   RECOVER_REG;
617   return iSadSum;
618 }
619 
WelsSampleSad4x4_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)620 int32_t WelsSampleSad4x4_mmi (uint8_t* pSample1, int32_t iStride1,
621                               uint8_t* pSample2, int32_t iStride2) {
622   int32_t iSadSum = 0;
623   __asm__ volatile (
624     ".set       arch=loongson3a                           \n\t"
625     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
626     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
627     "gsldlc1    $f2, 0x7($8)                              \n\t"
628     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
629     "gsldrc1    $f2, 0x0($8)                              \n\t"
630     "punpcklwd  $f0, $f0, $f2                             \n\t"
631 
632     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
633     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
634     "gsldlc1    $f8, 0x7($9)                              \n\t"
635     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
636     "gsldrc1    $f8, 0x0($9)                              \n\t"
637     "punpcklwd  $f6, $f6, $f8                             \n\t"
638     "pasubub    $f0, $f0, $f6                             \n\t"
639     "biadd      $f0, $f0                                  \n\t"
640 
641     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
642     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
643 
644     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
645     "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
646     "gsldlc1    $f4, 0x7($8)                              \n\t"
647     "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
648     "gsldrc1    $f4, 0x0($8)                              \n\t"
649     "punpcklwd  $f2, $f2, $f4                             \n\t"
650 
651     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
652     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
653     "gsldlc1    $f8, 0x7($9)                              \n\t"
654     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
655     "gsldrc1    $f8, 0x0($9)                              \n\t"
656     "punpcklwd  $f6, $f6, $f8                             \n\t"
657     "pasubub    $f2, $f2, $f6                             \n\t"
658     "biadd      $f2, $f2                                  \n\t"
659     "paddh      $f0, $f0, $f2                             \n\t"
660 
661     "dmfc1      %[iSadSum], $f0                           \n\t"
662     : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
663       [pSample2]"+&r"((unsigned char *)pSample2)
664     : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
665     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8"
666   );
667   return iSadSum;
668 }
669 
WelsSampleSad8x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)670 int32_t WelsSampleSad8x8_mmi (uint8_t* pSample1, int32_t iStride1,
671                               uint8_t* pSample2, int32_t iStride2) {
672   int32_t iSadSum = 0;
673   BACKUP_REG;
674   __asm__ volatile (
675     ".set       arch=loongson3a                           \n\t"
676     CACHE_SPLIT_CHECK($8, 8, 32)
677     "blez       $8, 1f                                    \n\t"
678     "nop                                                  \n\t"
679     "xor        $f28, $f28, $f28                          \n\t"
680     "xor        $f30, $f30, $f30                          \n\t"
681 
682     "move       $9, %[pSample2]                           \n\t"
683     "and        $9, $9, 0x7                               \n\t"
684     PTR_SUBU   "%[pSample2], %[pSample2], $9              \n\t"
685     "dli        $8, 0x8                                   \n\t"
686     PTR_SUBU   "$8, $8, $9                                \n\t"
687 
688     "dsll       $9, $9, 0x3                               \n\t"
689     "dsll       $8, $8, 0x3                               \n\t"
690     "dmtc1      $9, $f20                                  \n\t"
691     "dmtc1      $8, $f24                                  \n\t"
692     "dli        $9, 0x8                                   \n\t"
693     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
694     PTR_ADDU   "$9, $9, %[pSample2]                       \n\t"
695     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
696     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
697     "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
698 
699     "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
700     "gsldlc1    $f8, 0x7($9)                              \n\t"
701     "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
702     "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
703     "gsldrc1    $f8, 0x0($9)                              \n\t"
704     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
705     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
706     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
707     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
708     "gsldlc1    $f10, 0x7($9)                             \n\t"
709     "dsrl       $f4, $f4, $f20                            \n\t"
710     "gsldrc1    $f10, 0x0($9)                             \n\t"
711     "dsrl       $f6, $f6, $f20                            \n\t"
712     "dsll       $f8, $f8, $f24                            \n\t"
713     "dsll       $f10, $f10, $f24                          \n\t"
714     "or         $f4, $f4, $f8                             \n\t"
715     "or         $f6, $f6, $f10                            \n\t"
716 
717     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
718     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
719     "pasubub    $f0, $f0, $f4                             \n\t"
720     "pasubub    $f2, $f2, $f6                             \n\t"
721     "biadd      $f0, $f0                                  \n\t"
722     "biadd      $f2, $f2                                  \n\t"
723     "paddh      $f28, $f28, $f0                           \n\t"
724     "paddh      $f30, $f30, $f2                           \n\t"
725 
726     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
727     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
728     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
729 
730     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
731     "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
732 
733     "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
734     "gsldlc1    $f8, 0x7($9)                              \n\t"
735     "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
736     "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
737     "gsldrc1    $f8, 0x0($9)                              \n\t"
738     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
739     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
740     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
741     "gsldlc1    $f10, 0x7($9)                             \n\t"
742     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
743     "gsldrc1    $f10, 0x0($9)                             \n\t"
744     "dsrl       $f4, $f4, $f20                            \n\t"
745     "dsrl       $f6, $f6, $f20                            \n\t"
746     "dsll       $f8, $f8, $f24                            \n\t"
747     "dsll       $f10, $f10, $f24                          \n\t"
748     "or         $f4, $f4, $f8                             \n\t"
749     "or         $f6, $f6, $f10                            \n\t"
750 
751     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
752     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
753     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
754 
755     "pasubub    $f0, $f0, $f4                             \n\t"
756     "pasubub    $f2, $f2, $f6                             \n\t"
757     "biadd      $f0, $f0                                  \n\t"
758     "biadd      $f2, $f2                                  \n\t"
759     "paddh      $f28, $f28, $f0                           \n\t"
760     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
761     "paddh      $f30, $f30, $f2                           \n\t"
762     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
763 
764     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
765     "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
766 
767     "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
768     "gsldlc1    $f8, 0x7($9)                              \n\t"
769     "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
770     "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
771     "gsldrc1    $f8, 0x0($9)                              \n\t"
772     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
773     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
774     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
775     "gsldlc1    $f10, 0x7($9)                             \n\t"
776     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
777     "gsldrc1    $f10, 0x0($9)                             \n\t"
778     "dsrl       $f4, $f4, $f20                            \n\t"
779     "dsrl       $f6, $f6, $f20                            \n\t"
780     "dsll       $f8, $f8, $f24                            \n\t"
781     "dsll       $f10, $f10, $f24                          \n\t"
782     "or         $f4, $f4, $f8                             \n\t"
783     "or         $f6, $f6, $f10                            \n\t"
784 
785     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
786     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
787     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
788 
789     "pasubub    $f0, $f0, $f4                             \n\t"
790     "pasubub    $f2, $f2, $f6                             \n\t"
791     "biadd      $f0, $f0                                  \n\t"
792     "biadd      $f2, $f2                                  \n\t"
793     "paddh      $f28, $f28, $f0                           \n\t"
794     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
795     "paddh      $f30, $f30, $f2                           \n\t"
796 
797     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
798     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
799     "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
800 
801     "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
802     "gsldlc1    $f8, 0x7($9)                              \n\t"
803     "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
804     "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
805     "gsldrc1    $f8, 0x0($9)                              \n\t"
806     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
807     PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
808     "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
809     "gsldlc1    $f10, 0x7($9)                             \n\t"
810     "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
811     "gsldrc1    $f10, 0x0($9)                             \n\t"
812     "dsrl       $f4, $f4, $f20                            \n\t"
813     "dsrl       $f6, $f6, $f20                            \n\t"
814     "dsll       $f8, $f8, $f24                            \n\t"
815     "dsll       $f10, $f10, $f24                          \n\t"
816     "or         $f4, $f4, $f8                             \n\t"
817     "or         $f6, $f6, $f10                            \n\t"
818 
819     "pasubub    $f0, $f0, $f4                             \n\t"
820     "pasubub    $f2, $f2, $f6                             \n\t"
821     "biadd      $f0, $f0                                  \n\t"
822     "biadd      $f2, $f2                                  \n\t"
823     "paddh      $f28, $f28, $f0                           \n\t"
824     "paddh      $f30, $f30, $f2                           \n\t"
825 
826     "mov.d      $f0, $f30                                 \n\t"
827     "paddh      $f0, $f0, $f28                            \n\t"
828     "dmfc1      %[iSadSum], $f0                           \n\t"
829     "j          2f                                        \n\t"
830     "nop                                                  \n\t"
831 
832     "1:                                                   \n\t"
833     "xor        $f24, $f24, $f24                          \n\t"
834     "xor        $f26, $f26, $f26                          \n\t"
835     MMI_GetSad8x4
836     MMI_GetSad8x4_End
837     "paddh      $f0, $f26, $f24                           \n\t"
838     "dmfc1      %[iSadSum], $f0                           \n\t"
839     "2:                                                   \n\t"
840     : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
841       [pSample2]"+&r"((unsigned char *)pSample2)
842     : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
843     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
844       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
845   );
846   RECOVER_REG;
847   return iSadSum;
848 }
849 
WelsSampleSatd4x4_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)850 int32_t WelsSampleSatd4x4_mmi (uint8_t* pSample1, int32_t iStride1,
851                                uint8_t* pSample2, int32_t iStride2) {
852   int32_t iSatdSum = 0;
853   BACKUP_REG;
854   __asm__ volatile (
855     ".set       arch=loongson3a                           \n\t"
856     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
857     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
858     "gsldlc1    $f4, 0x7($8)                              \n\t"
859     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
860     "gsldrc1    $f4, 0x0($8)                              \n\t"
861     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
862     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
863     "gsldlc1    $f8, 0x7(%[pSample1])                     \n\t"
864     "gsldlc1    $f12, 0x7($8)                             \n\t"
865     "gsldrc1    $f8, 0x0(%[pSample1])                     \n\t"
866     "gsldrc1    $f12, 0x0($8)                             \n\t"
867     "punpcklwd  $f0, $f0, $f8                             \n\t"
868     "punpcklwd  $f4, $f4, $f12                            \n\t"
869 
870     PTR_ADDU   "$8, %[pSample2], %[iStride2]              \n\t"
871     "gsldlc1    $f16, 0x7(%[pSample2])                    \n\t"
872     "gsldlc1    $f20, 0x7($8)                             \n\t"
873     "gsldrc1    $f16, 0x0(%[pSample2])                    \n\t"
874     "gsldrc1    $f20, 0x0($8)                             \n\t"
875     PTR_ADDU   "%[pSample2], $8, %[iStride2]              \n\t"
876     PTR_ADDU   "$8, %[pSample2], %[iStride2]              \n\t"
877     "gsldlc1    $f24, 0x7(%[pSample2])                    \n\t"
878     "gsldlc1    $f28, 0x7($8)                             \n\t"
879     "gsldrc1    $f24, 0x0(%[pSample2])                    \n\t"
880     "gsldrc1    $f28, 0x0($8)                             \n\t"
881     "punpcklwd  $f16, $f16, $f24                          \n\t"
882     "punpcklwd  $f20, $f20, $f28                          \n\t"
883 
884     "xor        $f24, $f24, $f24                          \n\t"
885     "xor        $f26, $f26, $f26                          \n\t"
886     "punpckhbh  $f2, $f0, $f24                            \n\t"
887     "punpcklbh  $f0, $f0, $f24                            \n\t"
888     "punpckhbh  $f6, $f4, $f24                            \n\t"
889     "punpcklbh  $f4, $f4, $f24                            \n\t"
890     "punpckhbh  $f18, $f16, $f24                          \n\t"
891     "punpcklbh  $f16, $f16, $f24                          \n\t"
892     "punpckhbh  $f22, $f20, $f24                          \n\t"
893     "punpcklbh  $f20, $f20, $f24                          \n\t"
894 
895     "psubh      $f0, $f0, $f16                            \n\t"
896     "psubh      $f2, $f2, $f18                            \n\t"
897     "psubh      $f4, $f4, $f20                            \n\t"
898     "psubh      $f6, $f6, $f22                            \n\t"
899 
900     "mov.d      $f8, $f0                                  \n\t"
901     "mov.d      $f10, $f2                                 \n\t"
902     "paddh      $f0, $f0, $f4                             \n\t"
903     "paddh      $f2, $f2, $f6                             \n\t"
904     "psubh      $f8, $f8, $f4                             \n\t"
905     "psubh      $f10, $f10, $f6                           \n\t"
906     MMI_XSawp_DQ($f0, $f2, $f8, $f10, $f12, $f14)
907 
908     "mov.d      $f16, $f0                                 \n\t"
909     "mov.d      $f18, $f2                                 \n\t"
910     "paddh      $f0, $f0, $f12                            \n\t"
911     "paddh      $f2, $f2, $f14                            \n\t"
912     "psubh      $f16, $f16, $f12                          \n\t"
913     "psubh      $f18, $f18, $f14                          \n\t"
914 
915     "mov.d      $f8, $f2                                  \n\t"
916     "punpckhhw  $f2, $f0, $f16                            \n\t"
917     "punpcklhw  $f0, $f0, $f16                            \n\t"
918     "punpcklhw  $f16, $f18, $f8                           \n\t"
919     "punpckhhw  $f18, $f18, $f8                           \n\t"
920 
921     MMI_XSawp_WD($f0, $f2, $f16, $f18, $f12, $f14)
922     MMI_XSawp_DQ($f0, $f2, $f12, $f14, $f20, $f22)
923 
924     "mov.d      $f28, $f0                                 \n\t"
925     "mov.d      $f30, $f2                                 \n\t"
926     "paddh      $f0, $f0, $f20                            \n\t"
927     "paddh      $f2, $f2, $f22                            \n\t"
928     "psubh      $f28, $f28, $f20                          \n\t"
929     "psubh      $f30, $f30, $f22                          \n\t"
930 
931     MMI_XSawp_DQ($f0, $f2, $f28, $f30, $f4, $f6)
932 
933     "psubh      $f8, $f0, $f4                             \n\t"
934     "psubh      $f10, $f2, $f6                            \n\t"
935     "paddh      $f0, $f0, $f4                             \n\t"
936     "paddh      $f2, $f2, $f6                             \n\t"
937 
938     WELS_AbsH($f0, $f2, $f0, $f2, $f12, $f14)
939     "paddush    $f24, $f24, $f0                           \n\t"
940     "paddush    $f26, $f26, $f2                           \n\t"
941     WELS_AbsH($f8, $f10, $f8, $f10, $f16, $f18)
942     "paddush    $f24, $f24, $f8                           \n\t"
943     "paddush    $f26, $f26, $f10                          \n\t"
944     MMI_SumWHorizon1($f24, $f26, $f16, $f18, $f28, $f30, $8)
945 
946     "dmfc1      $8, $f24                                  \n\t"
947     "dli        $9, 0xffff                                \n\t"
948     "and        $8, $8, $9                                \n\t"
949     "dsrl       %[iSatdSum], $8, 0x1                      \n\t"
950     : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
951       [pSample2]"+&r"((unsigned char *)pSample2)
952     : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
953     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
954       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
955   );
956   RECOVER_REG;
957   return iSatdSum;
958 }
959 
WelsSampleSatd8x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)960 int32_t WelsSampleSatd8x8_mmi (uint8_t* pSample1, int32_t iStride1,
961                                uint8_t* pSample2, int32_t iStride2) {
962   int32_t iSatdSum = 0;
963   BACKUP_REG;
964   __asm__ volatile (
965     ".set       arch=loongson3a                           \n\t"
966     "xor        $f24, $f24, $f24                          \n\t"
967     "xor        $f26, $f26, $f26                          \n\t"
968     "dli        $8, 0x1                                   \n\t"
969     "xor        $f28, $f28, $f28                          \n\t"
970     "dmtc1      $8, $f30                                  \n\t"
971     MMI_GetSatd8x8_End
972     "psrlh      $f24, $f24, $f30                          \n\t"
973     "dli        $8, 0x4e                                  \n\t"
974     "psrlh      $f26, $f26, $f30                          \n\t"
975     "dmtc1      $8, $f30                                  \n\t"
976     MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
977     "mfc1       %[iSatdSum], $f24                         \n\t"
978     : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
979       [pSample2]"+&r"((unsigned char *)pSample2)
980     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
981     : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
982       "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
983   );
984   RECOVER_REG;
985   return iSatdSum;
986 }
987 
WelsSampleSatd8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)988 int32_t WelsSampleSatd8x16_mmi (uint8_t* pSample1, int32_t iStride1,
989                                 uint8_t* pSample2, int32_t iStride2) {
990   int32_t iSatdSum = 0;
991   BACKUP_REG;
992   __asm__ volatile (
993     ".set       arch=loongson3a                           \n\t"
994     "xor        $f24, $f24, $f24                          \n\t"
995     "xor        $f26, $f26, $f26                          \n\t"
996     "dli        $8, 0x1                                   \n\t"
997     "xor        $f28, $f28, $f28                          \n\t"
998     "dmtc1      $8, $f30                                  \n\t"
999     MMI_GetSatd8x8
1000     MMI_GetSatd8x8_End
1001     "psrlh      $f24, $f24, $f30                          \n\t"
1002     "dli        $8, 0x4e                                  \n\t"
1003     "psrlh      $f26, $f26, $f30                          \n\t"
1004     "dmtc1      $8, $f30                                  \n\t"
1005     MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
1006     "mfc1       %[iSatdSum], $f24                         \n\t"
1007     : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1008       [pSample2]"+&r"((unsigned char *)pSample2)
1009     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
1010     : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1011       "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1012   );
1013   RECOVER_REG;
1014   return iSatdSum;
1015 }
1016 
WelsSampleSatd16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)1017 int32_t WelsSampleSatd16x8_mmi (uint8_t* pSample1, int32_t iStride1,
1018                                 uint8_t* pSample2, int32_t iStride2) {
1019   int32_t iSatdSum = 0;
1020   BACKUP_REG;
1021   __asm__ volatile (
1022     ".set       arch=loongson3a                           \n\t"
1023     "xor        $f24, $f24, $f24                          \n\t"
1024     "xor        $f26, $f26, $f26                          \n\t"
1025     "dli        $8, 0x1                                   \n\t"
1026     "xor        $f28, $f28, $f28                          \n\t"
1027     "dmtc1      $8, $f30                                  \n\t"
1028     "move       $9, %[pSample1]                           \n\t"
1029     "move       $10, %[pSample2]                          \n\t"
1030     MMI_GetSatd8x8_Offset8
1031 
1032     MMI_GetSatd8x8_End
1033     "psrlh      $f24, $f24, $f30                          \n\t"
1034     "dli        $8, 0x4e                                  \n\t"
1035     "psrlh      $f26, $f26, $f30                          \n\t"
1036     "dmtc1      $8, $f30                                  \n\t"
1037     MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
1038     "mfc1       %[iSatdSum], $f24                         \n\t"
1039     : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1040       [pSample2]"+&r"((unsigned char *)pSample2)
1041     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
1042     : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
1043       "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
1044       "$f26", "$f28", "$f30"
1045   );
1046   RECOVER_REG;
1047   return iSatdSum;
1048 }
1049 
WelsSampleSatd16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)1050 int32_t WelsSampleSatd16x16_mmi (uint8_t* pSample1, int32_t iStride1,
1051                                  uint8_t* pSample2, int32_t iStride2) {
1052   int32_t iSatdSum = 0;
1053   BACKUP_REG;
1054   __asm__ volatile (
1055     ".set       arch=loongson3a                           \n\t"
1056     "xor        $f24, $f24, $f24                          \n\t"
1057     "xor        $f26, $f26, $f26                          \n\t"
1058     "dli        $8, 0x1                                   \n\t"
1059     "xor        $f28, $f28, $f28                          \n\t"
1060     "dmtc1      $8, $f30                                  \n\t"
1061     "move       $9, %[pSample1]                           \n\t"
1062     "move       $10, %[pSample2]                          \n\t"
1063 
1064     MMI_GetSatd8x8
1065     MMI_GetSatd8x8_Offset8
1066 
1067     MMI_GetSatd8x8
1068     MMI_GetSatd8x8_End
1069 
1070     "dli        $8, 0x4e                                  \n\t"
1071     "psrlh      $f24, $f24, $f30                          \n\t"
1072     "dmtc1      $8, $f0                                   \n\t"
1073     "psrlh      $f26, $f26, $f30                          \n\t"
1074     MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f0)
1075     "mfc1       %[iSatdSum], $f24                         \n\t"
1076     : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
1077       [pSample2]"+&r"((unsigned char *)pSample2)
1078     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
1079     : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
1080       "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
1081       "$f26", "$f28", "$f30"
1082   );
1083   RECOVER_REG;
1084   return iSatdSum;
1085 }
1086 
WelsSampleSadFour16x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1087 void WelsSampleSadFour16x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1088                                  int32_t iStride2, int32_t* pSad) {
1089   BACKUP_REG;
1090   __asm__ volatile (
1091     ".set       arch=loongson3a                           \n\t"
1092     "xor        $f16, $f16, $f16                          \n\t"
1093     "xor        $f18, $f18, $f18                          \n\t"
1094     "xor        $f20, $f20, $f20                          \n\t"
1095     "xor        $f22, $f22, $f22                          \n\t"
1096     PTR_SUBU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
1097     "xor        $f24, $f24, $f24                          \n\t"
1098     "xor        $f26, $f26, $f26                          \n\t"
1099     "xor        $f28, $f28, $f28                          \n\t"
1100     "xor        $f30, $f30, $f30                          \n\t"
1101     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1102     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1103     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1104     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1105     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1106     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
1107     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
1108     "pasubub    $f12, $f12, $f0                           \n\t"
1109     "pasubub    $f14, $f14, $f2                           \n\t"
1110     "biadd      $f12, $f12                                \n\t"
1111     "biadd      $f14, $f14                                \n\t"
1112     "paddh      $f16, $f16, $f12                          \n\t"
1113     "paddh      $f18, $f18, $f14                          \n\t"
1114 
1115     "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
1116     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1117     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1118     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1119     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1120     "pasubub    $f12, $f12, $f4                           \n\t"
1121     "pasubub    $f14, $f14, $f6                           \n\t"
1122     "biadd      $f12, $f12                                \n\t"
1123     "biadd      $f14, $f14                                \n\t"
1124     "paddh      $f16, $f16, $f12                          \n\t"
1125     "paddh      $f18, $f18, $f14                          \n\t"
1126 
1127     "gsldlc1    $f8, 0x6(%[pSample2])                     \n\t"
1128     "gsldlc1    $f10, 0xE(%[pSample2])                    \n\t"
1129     "gsldrc1    $f8, -0x1(%[pSample2])                    \n\t"
1130     "gsldrc1    $f10, 0x7(%[pSample2])                    \n\t"
1131     "pasubub    $f8, $f8, $f0                             \n\t"
1132     "pasubub    $f10, $f10, $f2                           \n\t"
1133     "biadd      $f8, $f8                                  \n\t"
1134     "biadd      $f10, $f10                                \n\t"
1135     "paddh      $f24, $f24, $f8                           \n\t"
1136     "paddh      $f26, $f26, $f10                          \n\t"
1137 
1138     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1139     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1140     "gsldlc1    $f14, 0x10(%[pSample2])                   \n\t"
1141     "gsldrc1    $f14, 0x9(%[pSample2])                    \n\t"
1142     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1143     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1144     "pasubub    $f12, $f12, $f0                           \n\t"
1145     "pasubub    $f14, $f14, $f2                           \n\t"
1146     "biadd      $f12, $f12                                \n\t"
1147     "biadd      $f14, $f14                                \n\t"
1148     "paddh      $f28, $f28, $f12                          \n\t"
1149     "paddh      $f30, $f30, $f14                          \n\t"
1150 
1151     "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
1152     "gsldlc1    $f12, 0x7($9)                             \n\t"
1153     "gsldlc1    $f14, 0xF($9)                             \n\t"
1154     "gsldrc1    $f12, 0x0($9)                             \n\t"
1155     "gsldrc1    $f14, 0x8($9)                             \n\t"
1156     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1157     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1158     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1159     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1160     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1161     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1162     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1163     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1164     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1165     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1166     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1167     "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
1168     "gsldlc1    $f12, 0x7($9)                             \n\t"
1169     "gsldlc1    $f14, 0xF($9)                             \n\t"
1170     "gsldrc1    $f12, 0x0($9)                             \n\t"
1171     "gsldrc1    $f14, 0x8($9)                             \n\t"
1172     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1173     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1174     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1175     "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
1176     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1177     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1178     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1179     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1180     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1181     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1182     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1183     "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
1184     "gsldlc1    $f12, 0x7($9)                             \n\t"
1185     "gsldlc1    $f14, 0xF($9)                             \n\t"
1186     "gsldrc1    $f12, 0x0($9)                             \n\t"
1187     "gsldrc1    $f14, 0x8($9)                             \n\t"
1188     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1189     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1190     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
1191     "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
1192     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1193     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1194     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1195     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1196     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1197     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1198     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
1199 
1200     "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
1201     "gsldlc1    $f12, 0x7($9)                             \n\t"
1202     "gsldlc1    $f14, 0xF($9)                             \n\t"
1203     "gsldrc1    $f12, 0x0($9)                             \n\t"
1204     "gsldrc1    $f14, 0x8($9)                             \n\t"
1205     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1206     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1207     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1208     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1209     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1210     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1211     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1212     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1213     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1214     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1215     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1216     "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
1217     "gsldlc1    $f12, 0x7($9)                             \n\t"
1218     "gsldlc1    $f14, 0xF($9)                             \n\t"
1219     "gsldrc1    $f12, 0x0($9)                             \n\t"
1220     "gsldrc1    $f14, 0x8($9)                             \n\t"
1221     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1222     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1223     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1224     "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
1225     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1226     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1227     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1228     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1229     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1230     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1231     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1232     "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
1233     "gsldlc1    $f12, 0x7($9)                             \n\t"
1234     "gsldlc1    $f14, 0xF($9)                             \n\t"
1235     "gsldrc1    $f12, 0x0($9)                             \n\t"
1236     "gsldrc1    $f14, 0x8($9)                             \n\t"
1237     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1238     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1239     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
1240     "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
1241     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1242     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1243     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1244     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1245     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1246     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1247     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
1248 
1249     "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
1250     "gsldlc1    $f12, 0x7($9)                             \n\t"
1251     "gsldlc1    $f14, 0xF($9)                             \n\t"
1252     "gsldrc1    $f12, 0x0($9)                             \n\t"
1253     "gsldrc1    $f14, 0x8($9)                             \n\t"
1254     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1255     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1256     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1257     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1258     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1259     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1260     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1261     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1262     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1263     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1264     "gsldlc1    $f12, 0x7($9)                             \n\t"
1265     "gsldlc1    $f14, 0xF($9)                             \n\t"
1266     "gsldrc1    $f12, 0x0($9)                             \n\t"
1267     "gsldrc1    $f14, 0x8($9)                             \n\t"
1268     "pasubub    $f8, $f8, $f12                            \n\t"
1269     "pasubub    $f10, $f10, $f14                          \n\t"
1270     "biadd      $f8, $f8                                  \n\t"
1271     "biadd      $f10, $f10                                \n\t"
1272     "paddh      $f20, $f20, $f8                           \n\t"
1273     "paddh      $f22, $f22, $f10                          \n\t"
1274 
1275     "gsldlc1    $f8, 0x6($9)                              \n\t"
1276     "gsldlc1    $f10, 0xE($9)                             \n\t"
1277     "gsldrc1    $f8, -0x1($9)                             \n\t"
1278     "gsldrc1    $f10, 0x7($9)                             \n\t"
1279     "pasubub    $f8, $f8, $f0                             \n\t"
1280     "pasubub    $f10, $f10, $f2                           \n\t"
1281     "biadd      $f8, $f8                                  \n\t"
1282     "biadd      $f10, $f10                                \n\t"
1283     "paddh      $f24, $f24, $f8                           \n\t"
1284     "paddh      $f26, $f26, $f10                          \n\t"
1285 
1286     "gsldlc1    $f12, 0x8($9)                             \n\t"
1287     "gsldlc1    $f14, 0x10($9)                            \n\t"
1288     "gsldrc1    $f12, 0x1($9)                             \n\t"
1289     "gsldrc1    $f14, 0x9($9)                             \n\t"
1290     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1291     "pasubub    $f12, $f12, $f0                           \n\t"
1292     "pasubub    $f14, $f14, $f2                           \n\t"
1293     "biadd      $f12, $f12                                \n\t"
1294     "biadd      $f14, $f14                                \n\t"
1295     "paddh      $f28, $f28, $f12                          \n\t"
1296     "paddh      $f30, $f30, $f14                          \n\t"
1297 
1298     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1299     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1300     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1301     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1302     "pasubub    $f0, $f0, $f12                            \n\t"
1303     "pasubub    $f2, $f2, $f14                            \n\t"
1304     "biadd      $f0, $f0                                  \n\t"
1305     "biadd      $f2, $f2                                  \n\t"
1306     "paddh      $f20, $f20, $f0                           \n\t"
1307     "paddh      $f22, $f22, $f2                           \n\t"
1308 
1309     "paddh      $f16, $f16, $f18                          \n\t"
1310     "paddh      $f20, $f20, $f22                          \n\t"
1311     "paddh      $f24, $f24, $f26                          \n\t"
1312     "paddh      $f28, $f28, $f30                          \n\t"
1313     "punpcklwd  $f16, $f16, $f20                          \n\t"
1314     "punpcklwd  $f24, $f24, $f28                          \n\t"
1315     "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
1316     : [pSample1]"+&r"((unsigned char *)pSample1),
1317       [pSample2]"+&r"((unsigned char *)pSample2)
1318     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1319       [pSad]"r"((int *)pSad)
1320     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1321       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1322   );
1323   RECOVER_REG;
1324 }
1325 
WelsSampleSadFour16x8_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1326 void WelsSampleSadFour16x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1327                                 int32_t iStride2, int32_t* pSad) {
1328   BACKUP_REG;
1329   __asm__ volatile (
1330     ".set       arch=loongson3a                           \n\t"
1331     "xor        $f16, $f16, $f16                          \n\t"
1332     "xor        $f18, $f18, $f18                          \n\t"
1333     "xor        $f20, $f20, $f20                          \n\t"
1334     "xor        $f22, $f22, $f22                          \n\t"
1335     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1336     PTR_SUBU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
1337     "xor        $f24, $f24, $f24                          \n\t"
1338     "xor        $f26, $f26, $f26                          \n\t"
1339     "xor        $f28, $f28, $f28                          \n\t"
1340     "xor        $f30, $f30, $f30                          \n\t"
1341     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1342     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1343     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1344     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1345     PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
1346     "pasubub    $f12, $f12, $f0                           \n\t"
1347     "pasubub    $f14, $f14, $f2                           \n\t"
1348     PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
1349     "biadd      $f12, $f12                                \n\t"
1350     "biadd      $f14, $f14                                \n\t"
1351     "paddh      $f16, $f16, $f12                          \n\t"
1352     "paddh      $f18, $f18, $f14                          \n\t"
1353 
1354     "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
1355     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1356     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1357     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1358     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1359     "pasubub    $f12, $f12, $f4                           \n\t"
1360     "pasubub    $f14, $f14, $f6                           \n\t"
1361     "biadd      $f12, $f12                                \n\t"
1362     "biadd      $f14, $f14                                \n\t"
1363     "paddh      $f16, $f16, $f12                          \n\t"
1364     "paddh      $f18, $f18, $f14                          \n\t"
1365 
1366     "gsldlc1    $f8, 0x6(%[pSample2])                     \n\t"
1367     "gsldlc1    $f10, 0xE(%[pSample2])                    \n\t"
1368     "gsldrc1    $f8, -0x1(%[pSample2])                    \n\t"
1369     "gsldrc1    $f10, 0x7(%[pSample2])                    \n\t"
1370     "pasubub    $f8, $f8, $f0                             \n\t"
1371     "pasubub    $f10, $f10, $f2                           \n\t"
1372     "biadd      $f8, $f8                                  \n\t"
1373     "biadd      $f10, $f10                                \n\t"
1374     "paddh      $f24, $f24, $f8                           \n\t"
1375     "paddh      $f26, $f26, $f10                          \n\t"
1376 
1377     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1378     "gsldlc1    $f14, 0x10(%[pSample2])                   \n\t"
1379     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1380     "gsldrc1    $f14, 0x9(%[pSample2])                    \n\t"
1381     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1382     "pasubub    $f12, $f12, $f0                           \n\t"
1383     "pasubub    $f14, $f14, $f2                           \n\t"
1384     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1385     "biadd      $f12, $f12                                \n\t"
1386     "biadd      $f14, $f14                                \n\t"
1387     "paddh      $f28, $f28, $f12                          \n\t"
1388     "paddh      $f30, $f30, $f14                          \n\t"
1389 
1390     "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
1391     "gsldlc1    $f12, 0x7($9)                             \n\t"
1392     "gsldlc1    $f14, 0xF($9)                             \n\t"
1393     "gsldrc1    $f12, 0x0($9)                             \n\t"
1394     "gsldrc1    $f14, 0x8($9)                             \n\t"
1395     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1396     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1397     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
1398     "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
1399     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1400     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1401     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1402     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1403     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1404     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1405     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
1406     "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
1407     "gsldlc1    $f12, 0x7($9)                             \n\t"
1408     "gsldlc1    $f14, 0xF($9)                             \n\t"
1409     "gsldrc1    $f12, 0x0($9)                             \n\t"
1410     "gsldrc1    $f14, 0x8($9)                             \n\t"
1411     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1412     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1413     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
1414     "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
1415     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1416     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1417     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1418     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1419     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1420     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1421     MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
1422     "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
1423     "gsldlc1    $f12, 0x7($9)                             \n\t"
1424     "gsldlc1    $f14, 0xF($9)                             \n\t"
1425     "gsldrc1    $f12, 0x0($9)                             \n\t"
1426     "gsldrc1    $f14, 0x8($9)                             \n\t"
1427     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1428     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1429     MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
1430     "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
1431     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1432     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1433     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1434     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1435     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1436     MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
1437     "gsldlc1    $f12, 0x7($9)                             \n\t"
1438     "gsldlc1    $f14, 0xF($9)                             \n\t"
1439     "gsldrc1    $f12, 0x0($9)                             \n\t"
1440     "gsldrc1    $f14, 0x8($9)                             \n\t"
1441     "pasubub    $f0, $f0, $f12                            \n\t"
1442     "pasubub    $f2, $f2, $f14                            \n\t"
1443     "biadd      $f0, $f0                                  \n\t"
1444     "biadd      $f2, $f2                                  \n\t"
1445     "paddh      $f20, $f20, $f0                           \n\t"
1446     "paddh      $f22, $f22, $f2                           \n\t"
1447 
1448     "gsldlc1    $f0, 0x6($9)                              \n\t"
1449     "gsldlc1    $f2, 0xE($9)                              \n\t"
1450     "gsldrc1    $f0, -0x1($9)                             \n\t"
1451     "gsldrc1    $f2, 0x7($9)                              \n\t"
1452     "pasubub    $f0, $f0, $f4                             \n\t"
1453     "pasubub    $f2, $f2, $f6                             \n\t"
1454     "biadd      $f0, $f0                                  \n\t"
1455     "biadd      $f2, $f2                                  \n\t"
1456     "paddh      $f24, $f24, $f0                           \n\t"
1457     "paddh      $f26, $f26, $f2                           \n\t"
1458 
1459     "gsldlc1    $f12, 0x8($9)                             \n\t"
1460     "gsldlc1    $f14, 0x10($9)                            \n\t"
1461     "gsldrc1    $f12, 0x1($9)                             \n\t"
1462     "gsldrc1    $f14, 0x9($9)                             \n\t"
1463     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1464     "pasubub    $f12, $f12, $f4                           \n\t"
1465     "pasubub    $f14, $f14, $f6                           \n\t"
1466     "biadd      $f12, $f12                                \n\t"
1467     "biadd      $f14, $f14                                \n\t"
1468     "paddh      $f28, $f28, $f12                          \n\t"
1469     "paddh      $f30, $f30, $f14                          \n\t"
1470 
1471     "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
1472     "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
1473     "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
1474     "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
1475     "pasubub    $f4, $f4, $f12                            \n\t"
1476     "pasubub    $f6, $f6, $f14                            \n\t"
1477     "biadd      $f4, $f4                                  \n\t"
1478     "biadd      $f6, $f6                                  \n\t"
1479     "paddh      $f20, $f20, $f4                           \n\t"
1480     "paddh      $f22, $f22, $f6                           \n\t"
1481 
1482     "paddh      $f16, $f16, $f18                          \n\t"
1483     "paddh      $f20, $f20, $f22                          \n\t"
1484     "paddh      $f24, $f24, $f26                          \n\t"
1485     "paddh      $f28, $f28, $f30                          \n\t"
1486     "punpcklwd  $f16, $f16, $f20                          \n\t"
1487     "punpcklwd  $f24, $f24, $f28                          \n\t"
1488     "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
1489     : [pSample1]"+&r"((unsigned char *)pSample1),
1490       [pSample2]"+&r"((unsigned char *)pSample2)
1491     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1492       [pSad]"r"((int *)pSad)
1493     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1494       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
1495   );
1496   RECOVER_REG;
1497 }
1498 
WelsSampleSadFour8x16_mmi(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2,int32_t * pSad)1499 void WelsSampleSadFour8x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1500                                 int32_t iStride2, int32_t* pSad) {
1501   BACKUP_REG;
1502   __asm__ volatile (
1503     ".set       arch=loongson3a                           \n\t"
1504     "xor        $f16, $f16, $f16                          \n\t"
1505     "xor        $f18, $f18, $f18                          \n\t"
1506     "xor        $f20, $f20, $f20                          \n\t"
1507     "xor        $f22, $f22, $f22                          \n\t"
1508     "xor        $f24, $f24, $f24                          \n\t"
1509     "xor        $f26, $f26, $f26                          \n\t"
1510     "xor        $f28, $f28, $f28                          \n\t"
1511     "xor        $f30, $f30, $f30                          \n\t"
1512     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1513     PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t"
1514     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1515     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1516     "gsldlc1    $f2, 0x7($8)                              \n\t"
1517     "gsldlc1    $f12, 0x7($9)                             \n\t"
1518     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1519     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1520     "gsldrc1    $f2, 0x0($8)                              \n\t"
1521     "gsldrc1    $f12, 0x0($9)                             \n\t"
1522     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1523     "pasubub    $f12, $f12, $f0                           \n\t"
1524     "pasubub    $f14, $f14, $f2                           \n\t"
1525     "biadd      $f12, $f12                                \n\t"
1526     "biadd      $f14, $f14                                \n\t"
1527     "paddh      $f16, $f16, $f12                          \n\t"
1528     "paddh      $f18, $f18, $f14                          \n\t"
1529 
1530     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1531     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1532     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1533     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1534     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1535 
1536     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1537     "gsldlc1    $f6, 0x6($9)                              \n\t"
1538     "gsldlc1    $f14, 0x8($9)                             \n\t"
1539     "gsldrc1    $f6, -0x1($9)                             \n\t"
1540     "gsldrc1    $f14, 0x1($9)                             \n\t"
1541     "pasubub    $f4, $f4, $f0                             \n\t"
1542     "pasubub    $f6, $f6, $f2                             \n\t"
1543     "biadd      $f4, $f4                                  \n\t"
1544     "biadd      $f6, $f6                                  \n\t"
1545     "paddh      $f24, $f24, $f4                           \n\t"
1546     "paddh      $f26, $f26, $f6                           \n\t"
1547     "pasubub    $f12, $f12, $f0                           \n\t"
1548     "pasubub    $f14, $f14, $f2                           \n\t"
1549     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1550     "biadd      $f12, $f12                                \n\t"
1551     "biadd      $f14, $f14                                \n\t"
1552     "paddh      $f28, $f28, $f12                          \n\t"
1553     "paddh      $f30, $f30, $f14                          \n\t"
1554 
1555     "gsldlc1    $f12, 0x7($9)                             \n\t"
1556     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1557     "gsldrc1    $f12, 0x0($9)                             \n\t"
1558     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1559     "pasubub    $f0, $f0, $f12                            \n\t"
1560     "pasubub    $f2, $f2, $f14                            \n\t"
1561     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1562     "biadd      $f0, $f0                                  \n\t"
1563     "biadd      $f2, $f2                                  \n\t"
1564     "paddh      $f20, $f20, $f0                           \n\t"
1565     "paddh      $f22, $f22, $f2                           \n\t"
1566 
1567     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1568     "gsldlc1    $f2, 0x7($8)                              \n\t"
1569     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1570     "gsldrc1    $f2, 0x0($8)                              \n\t"
1571     "pasubub    $f12, $f12, $f0                           \n\t"
1572     "pasubub    $f14, $f14, $f2                           \n\t"
1573     "biadd      $f12, $f12                                \n\t"
1574     "biadd      $f14, $f14                                \n\t"
1575     "paddh      $f16, $f16, $f12                          \n\t"
1576     "paddh      $f18, $f18, $f14                          \n\t"
1577 
1578     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1579     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1580     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1581     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1582     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1583 
1584     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1585     "gsldlc1    $f6, 0x6($9)                              \n\t"
1586     "gsldlc1    $f14, 0x8($9)                             \n\t"
1587     "gsldrc1    $f6, -0x1($9)                             \n\t"
1588     "gsldrc1    $f14, 0x1($9)                             \n\t"
1589 
1590     "pasubub    $f4, $f4, $f0                             \n\t"
1591     "pasubub    $f6, $f6, $f2                             \n\t"
1592     "biadd      $f4, $f4                                  \n\t"
1593     "biadd      $f6, $f6                                  \n\t"
1594     "paddh      $f24, $f24, $f4                           \n\t"
1595     "paddh      $f26, $f26, $f6                           \n\t"
1596     "pasubub    $f12, $f12, $f0                           \n\t"
1597     "pasubub    $f14, $f14, $f2                           \n\t"
1598     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1599     "biadd      $f12, $f12                                \n\t"
1600     "biadd      $f14, $f14                                \n\t"
1601     "paddh      $f28, $f28, $f12                          \n\t"
1602     "paddh      $f30, $f30, $f14                          \n\t"
1603 
1604     "gsldlc1    $f12, 0x7($9)                             \n\t"
1605     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1606     "gsldrc1    $f12, 0x0($9)                             \n\t"
1607     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1608     "pasubub    $f0, $f0, $f12                            \n\t"
1609     "pasubub    $f2, $f2, $f14                            \n\t"
1610     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1611     "biadd      $f0, $f0                                  \n\t"
1612     "biadd      $f2, $f2                                  \n\t"
1613     "paddh      $f20, $f20, $f0                           \n\t"
1614     "paddh      $f22, $f22, $f2                           \n\t"
1615 
1616     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1617     "gsldlc1    $f2, 0x7($8)                              \n\t"
1618     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1619     "gsldrc1    $f2, 0x0($8)                              \n\t"
1620     "pasubub    $f12, $f12, $f0                           \n\t"
1621     "pasubub    $f14, $f14, $f2                           \n\t"
1622     "biadd      $f12, $f12                                \n\t"
1623     "biadd      $f14, $f14                                \n\t"
1624     "paddh      $f16, $f16, $f12                          \n\t"
1625     "paddh      $f18, $f18, $f14                          \n\t"
1626 
1627     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1628     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1629     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1630     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1631     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1632 
1633     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1634     "gsldlc1    $f6, 0x6($9)                              \n\t"
1635     "gsldlc1    $f14, 0x8($9)                             \n\t"
1636     "gsldrc1    $f6, -0x1($9)                             \n\t"
1637     "gsldrc1    $f14, 0x1($9)                             \n\t"
1638 
1639     "pasubub    $f4, $f4, $f0                             \n\t"
1640     "pasubub    $f6, $f6, $f2                             \n\t"
1641     "biadd      $f4, $f4                                  \n\t"
1642     "biadd      $f6, $f6                                  \n\t"
1643     "paddh      $f24, $f24, $f4                           \n\t"
1644     "paddh      $f26, $f26, $f6                           \n\t"
1645     "pasubub    $f12, $f12, $f0                           \n\t"
1646     "pasubub    $f14, $f14, $f2                           \n\t"
1647     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1648     "biadd      $f12, $f12                                \n\t"
1649     "biadd      $f14, $f14                                \n\t"
1650     "paddh      $f28, $f28, $f12                          \n\t"
1651     "paddh      $f30, $f30, $f14                          \n\t"
1652 
1653     "gsldlc1    $f12, 0x7($9)                             \n\t"
1654     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1655     "gsldrc1    $f12, 0x0($9)                             \n\t"
1656     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1657     "pasubub    $f0, $f0, $f12                            \n\t"
1658     "pasubub    $f2, $f2, $f14                            \n\t"
1659     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1660     "biadd      $f0, $f0                                  \n\t"
1661     "biadd      $f2, $f2                                  \n\t"
1662     "paddh      $f20, $f20, $f0                           \n\t"
1663     "paddh      $f22, $f22, $f2                           \n\t"
1664 
1665     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1666     "gsldlc1    $f2, 0x7($8)                              \n\t"
1667     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1668     "gsldrc1    $f2, 0x0($8)                              \n\t"
1669     "pasubub    $f12, $f12, $f0                           \n\t"
1670     "pasubub    $f14, $f14, $f2                           \n\t"
1671     "biadd      $f12, $f12                                \n\t"
1672     "biadd      $f14, $f14                                \n\t"
1673     "paddh      $f16, $f16, $f12                          \n\t"
1674     "paddh      $f18, $f18, $f14                          \n\t"
1675 
1676     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1677     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1678     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1679     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1680     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1681 
1682     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1683     "gsldlc1    $f6, 0x6($9)                              \n\t"
1684     "gsldlc1    $f14, 0x8($9)                             \n\t"
1685     "gsldrc1    $f6, -0x1($9)                             \n\t"
1686     "gsldrc1    $f14, 0x1($9)                             \n\t"
1687     "pasubub    $f4, $f4, $f0                             \n\t"
1688     "pasubub    $f6, $f6, $f2                             \n\t"
1689     "biadd      $f4, $f4                                  \n\t"
1690     "biadd      $f6, $f6                                  \n\t"
1691     "paddh      $f24, $f24, $f4                           \n\t"
1692     "paddh      $f26, $f26, $f6                           \n\t"
1693     "pasubub    $f12, $f12, $f0                           \n\t"
1694     "pasubub    $f14, $f14, $f2                           \n\t"
1695     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1696     "biadd      $f12, $f12                                \n\t"
1697     "biadd      $f14, $f14                                \n\t"
1698     "paddh      $f28, $f28, $f12                          \n\t"
1699     "paddh      $f30, $f30, $f14                          \n\t"
1700 
1701     "gsldlc1    $f12, 0x7($9)                             \n\t"
1702     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1703     "gsldrc1    $f12, 0x0($9)                             \n\t"
1704     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1705     "pasubub    $f0, $f0, $f12                            \n\t"
1706     "pasubub    $f2, $f2, $f14                            \n\t"
1707     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1708     "biadd      $f0, $f0                                  \n\t"
1709     "biadd      $f2, $f2                                  \n\t"
1710     "paddh      $f20, $f20, $f0                           \n\t"
1711     "paddh      $f22, $f22, $f2                           \n\t"
1712 
1713     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1714     "gsldlc1    $f2, 0x7($8)                              \n\t"
1715     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1716     "gsldrc1    $f2, 0x0($8)                              \n\t"
1717     "pasubub    $f12, $f12, $f0                           \n\t"
1718     "pasubub    $f14, $f14, $f2                           \n\t"
1719     "biadd      $f12, $f12                                \n\t"
1720     "biadd      $f14, $f14                                \n\t"
1721     "paddh      $f16, $f16, $f12                          \n\t"
1722     "paddh      $f18, $f18, $f14                          \n\t"
1723 
1724     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1725     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1726     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1727     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1728     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1729 
1730     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1731     "gsldlc1    $f6, 0x6($9)                              \n\t"
1732     "gsldlc1    $f14, 0x8($9)                             \n\t"
1733     "gsldrc1    $f6, -0x1($9)                             \n\t"
1734     "gsldrc1    $f14, 0x1($9)                             \n\t"
1735 
1736     "pasubub    $f4, $f4, $f0                             \n\t"
1737     "pasubub    $f6, $f6, $f2                             \n\t"
1738     "biadd      $f4, $f4                                  \n\t"
1739     "biadd      $f6, $f6                                  \n\t"
1740     "paddh      $f24, $f24, $f4                           \n\t"
1741     "paddh      $f26, $f26, $f6                           \n\t"
1742     "pasubub    $f12, $f12, $f0                           \n\t"
1743     "pasubub    $f14, $f14, $f2                           \n\t"
1744     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1745     "biadd      $f12, $f12                                \n\t"
1746     "biadd      $f14, $f14                                \n\t"
1747     "paddh      $f28, $f28, $f12                          \n\t"
1748     "paddh      $f30, $f30, $f14                          \n\t"
1749 
1750     "gsldlc1    $f12, 0x7($9)                             \n\t"
1751     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1752     "gsldrc1    $f12, 0x0($9)                             \n\t"
1753     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1754     "pasubub    $f0, $f0, $f12                            \n\t"
1755     "pasubub    $f2, $f2, $f14                            \n\t"
1756     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1757     "biadd      $f0, $f0                                  \n\t"
1758     "biadd      $f2, $f2                                  \n\t"
1759     "paddh      $f20, $f20, $f0                           \n\t"
1760     "paddh      $f22, $f22, $f2                           \n\t"
1761 
1762     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1763     "gsldlc1    $f2, 0x7($8)                              \n\t"
1764     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1765     "gsldrc1    $f2, 0x0($8)                              \n\t"
1766     "pasubub    $f12, $f12, $f0                           \n\t"
1767     "pasubub    $f14, $f14, $f2                           \n\t"
1768     "biadd      $f12, $f12                                \n\t"
1769     "biadd      $f14, $f14                                \n\t"
1770     "paddh      $f16, $f16, $f12                          \n\t"
1771     "paddh      $f18, $f18, $f14                          \n\t"
1772 
1773     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1774     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1775     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1776     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1777     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1778 
1779     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1780     "gsldlc1    $f6, 0x6($9)                              \n\t"
1781     "gsldlc1    $f14, 0x8($9)                             \n\t"
1782     "gsldrc1    $f6, -0x1($9)                             \n\t"
1783     "gsldrc1    $f14, 0x1($9)                             \n\t"
1784 
1785     "pasubub    $f4, $f4, $f0                             \n\t"
1786     "pasubub    $f6, $f6, $f2                             \n\t"
1787     "biadd      $f4, $f4                                  \n\t"
1788     "biadd      $f6, $f6                                  \n\t"
1789     "paddh      $f24, $f24, $f4                           \n\t"
1790     "paddh      $f26, $f26, $f6                           \n\t"
1791     "pasubub    $f12, $f12, $f0                           \n\t"
1792     "pasubub    $f14, $f14, $f2                           \n\t"
1793     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1794     "biadd      $f12, $f12                                \n\t"
1795     "biadd      $f14, $f14                                \n\t"
1796     "paddh      $f28, $f28, $f12                          \n\t"
1797     "paddh      $f30, $f30, $f14                          \n\t"
1798 
1799     "gsldlc1    $f12, 0x7($9)                             \n\t"
1800     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1801     "gsldrc1    $f12, 0x0($9)                             \n\t"
1802     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1803     "pasubub    $f0, $f0, $f12                            \n\t"
1804     "pasubub    $f2, $f2, $f14                            \n\t"
1805     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1806     "biadd      $f0, $f0                                  \n\t"
1807     "biadd      $f2, $f2                                  \n\t"
1808     "paddh      $f20, $f20, $f0                           \n\t"
1809     "paddh      $f22, $f22, $f2                           \n\t"
1810 
1811     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1812     "gsldlc1    $f2, 0x7($8)                              \n\t"
1813     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1814     "gsldrc1    $f2, 0x0($8)                              \n\t"
1815     "pasubub    $f12, $f12, $f0                           \n\t"
1816     "pasubub    $f14, $f14, $f2                           \n\t"
1817     "biadd      $f12, $f12                                \n\t"
1818     "biadd      $f14, $f14                                \n\t"
1819     "paddh      $f16, $f16, $f12                          \n\t"
1820     "paddh      $f18, $f18, $f14                          \n\t"
1821 
1822     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1823     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1824     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1825     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1826     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1827 
1828     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1829     "gsldlc1    $f6, 0x6($9)                              \n\t"
1830     "gsldlc1    $f14, 0x8($9)                             \n\t"
1831     "gsldrc1    $f6, -0x1($9)                             \n\t"
1832     "gsldrc1    $f14, 0x1($9)                             \n\t"
1833 
1834     "pasubub    $f4, $f4, $f0                             \n\t"
1835     "pasubub    $f6, $f6, $f2                             \n\t"
1836     "biadd      $f4, $f4                                  \n\t"
1837     "biadd      $f6, $f6                                  \n\t"
1838     "paddh      $f24, $f24, $f4                           \n\t"
1839     "paddh      $f26, $f26, $f6                           \n\t"
1840     "pasubub    $f12, $f12, $f0                           \n\t"
1841     "pasubub    $f14, $f14, $f2                           \n\t"
1842     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1843     "biadd      $f12, $f12                                \n\t"
1844     "biadd      $f14, $f14                                \n\t"
1845     "paddh      $f28, $f28, $f12                          \n\t"
1846     "paddh      $f30, $f30, $f14                          \n\t"
1847 
1848     "gsldlc1    $f12, 0x7($9)                             \n\t"
1849     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1850     "gsldrc1    $f12, 0x0($9)                             \n\t"
1851     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1852     "pasubub    $f0, $f0, $f12                            \n\t"
1853     "pasubub    $f2, $f2, $f14                            \n\t"
1854     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1855     "biadd      $f0, $f0                                  \n\t"
1856     "biadd      $f2, $f2                                  \n\t"
1857     "paddh      $f20, $f20, $f0                           \n\t"
1858     "paddh      $f22, $f22, $f2                           \n\t"
1859 
1860     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1861     "gsldlc1    $f2, 0x7($8)                              \n\t"
1862     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1863     "gsldrc1    $f2, 0x0($8)                              \n\t"
1864     "pasubub    $f12, $f12, $f0                           \n\t"
1865     "pasubub    $f14, $f14, $f2                           \n\t"
1866     "biadd      $f12, $f12                                \n\t"
1867     "biadd      $f14, $f14                                \n\t"
1868     "paddh      $f16, $f16, $f12                          \n\t"
1869     "paddh      $f18, $f18, $f14                          \n\t"
1870 
1871     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1872     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1873     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1874     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1875     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1876 
1877     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1878     "gsldlc1    $f6, 0x6($9)                              \n\t"
1879     "gsldlc1    $f14, 0x8($9)                             \n\t"
1880     "gsldrc1    $f6, -0x1($9)                             \n\t"
1881     "gsldrc1    $f14, 0x1($9)                             \n\t"
1882 
1883     "pasubub    $f4, $f4, $f0                             \n\t"
1884     "pasubub    $f6, $f6, $f2                             \n\t"
1885     "biadd      $f4, $f4                                  \n\t"
1886     "biadd      $f6, $f6                                  \n\t"
1887     "paddh      $f24, $f24, $f4                           \n\t"
1888     "paddh      $f26, $f26, $f6                           \n\t"
1889     "pasubub    $f12, $f12, $f0                           \n\t"
1890     "pasubub    $f14, $f14, $f2                           \n\t"
1891     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1892     "biadd      $f12, $f12                                \n\t"
1893     "biadd      $f14, $f14                                \n\t"
1894     "paddh      $f28, $f28, $f12                          \n\t"
1895     "paddh      $f30, $f30, $f14                          \n\t"
1896 
1897     "gsldlc1    $f12, 0x7($9)                             \n\t"
1898     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1899     "gsldrc1    $f12, 0x0($9)                             \n\t"
1900     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1901     "pasubub    $f0, $f0, $f12                            \n\t"
1902     "pasubub    $f2, $f2, $f14                            \n\t"
1903     "biadd      $f0, $f0                                  \n\t"
1904     "biadd      $f2, $f2                                  \n\t"
1905     "paddh      $f20, $f20, $f0                           \n\t"
1906     "paddh      $f22, $f22, $f2                           \n\t"
1907 
1908     "paddh      $f16, $f16, $f18                          \n\t"
1909     "paddh      $f20, $f20, $f22                          \n\t"
1910     "paddh      $f24, $f24, $f26                          \n\t"
1911     "paddh      $f28, $f28, $f30                          \n\t"
1912     "punpcklwd  $f16, $f16, $f20                          \n\t"
1913     "punpcklwd  $f24, $f24, $f28                          \n\t"
1914     "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
1915     : [pSample1]"+&r"((unsigned char *)pSample1),
1916       [pSample2]"+&r"((unsigned char *)pSample2)
1917     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
1918       [pSad]"r"((int *)pSad)
1919     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1920       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
1921   );
1922   RECOVER_REG;
1923 }
1924 
1925 void WelsSampleSadFour8x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
1926                                int32_t iStride2, int32_t* pSad) {
1927   BACKUP_REG;
1928   __asm__ volatile (
1929     ".set       arch=loongson3a                           \n\t"
1930     "xor        $f16, $f16, $f16                          \n\t"
1931     "xor        $f18, $f18, $f18                          \n\t"
1932     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1933     "xor        $f20, $f20, $f20                          \n\t"
1934     "xor        $f22, $f22, $f22                          \n\t"
1935     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1936     "xor        $f24, $f24, $f24                          \n\t"
1937     "xor        $f26, $f26, $f26                          \n\t"
1938     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1939     PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t"
1940     "xor        $f28, $f28, $f28                          \n\t"
1941     "xor        $f30, $f30, $f30                          \n\t"
1942     "gsldlc1    $f2, 0x7($8)                              \n\t"
1943     "gsldlc1    $f12, 0x7($9)                             \n\t"
1944     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1945     "gsldrc1    $f2, 0x0($8)                              \n\t"
1946     "gsldrc1    $f12, 0x0($9)                             \n\t"
1947     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1948     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1949     "pasubub    $f12, $f12, $f0                           \n\t"
1950     "pasubub    $f14, $f14, $f2                           \n\t"
1951     "biadd      $f12, $f12                                \n\t"
1952     "biadd      $f14, $f14                                \n\t"
1953     "paddh      $f16, $f16, $f12                          \n\t"
1954     "paddh      $f18, $f18, $f14                          \n\t"
1955 
1956     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
1957     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
1958     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
1959     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
1960     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
1961     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
1962 
1963     "gsldlc1    $f6, 0x6($9)                              \n\t"
1964     "gsldlc1    $f14, 0x8($9)                             \n\t"
1965     "gsldrc1    $f6, -0x1($9)                             \n\t"
1966     "gsldrc1    $f14, 0x1($9)                             \n\t"
1967     "pasubub    $f4, $f4, $f0                             \n\t"
1968     "pasubub    $f6, $f6, $f2                             \n\t"
1969     "biadd      $f4, $f4                                  \n\t"
1970     "biadd      $f6, $f6                                  \n\t"
1971     "paddh      $f24, $f24, $f4                           \n\t"
1972     "paddh      $f26, $f26, $f6                           \n\t"
1973     "pasubub    $f12, $f12, $f0                           \n\t"
1974     "pasubub    $f14, $f14, $f2                           \n\t"
1975     "biadd      $f12, $f12                                \n\t"
1976     "biadd      $f14, $f14                                \n\t"
1977     "paddh      $f28, $f28, $f12                          \n\t"
1978     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
1979     "paddh      $f30, $f30, $f14                          \n\t"
1980 
1981     "gsldlc1    $f12, 0x7($9)                             \n\t"
1982     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
1983     "gsldrc1    $f12, 0x0($9)                             \n\t"
1984     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
1985     "pasubub    $f0, $f0, $f12                            \n\t"
1986     "pasubub    $f2, $f2, $f14                            \n\t"
1987     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
1988     "biadd      $f0, $f0                                  \n\t"
1989     "biadd      $f2, $f2                                  \n\t"
1990     "paddh      $f20, $f20, $f0                           \n\t"
1991     "paddh      $f22, $f22, $f2                           \n\t"
1992 
1993     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
1994     "gsldlc1    $f2, 0x7($8)                              \n\t"
1995     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
1996     "gsldrc1    $f2, 0x0($8)                              \n\t"
1997     "pasubub    $f12, $f12, $f0                           \n\t"
1998     "pasubub    $f14, $f14, $f2                           \n\t"
1999     "biadd      $f12, $f12                                \n\t"
2000     "biadd      $f14, $f14                                \n\t"
2001     "paddh      $f16, $f16, $f12                          \n\t"
2002     "paddh      $f18, $f18, $f14                          \n\t"
2003 
2004     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
2005     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
2006     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
2007     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
2008     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
2009     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
2010 
2011     "gsldlc1    $f6, 0x6($9)                              \n\t"
2012     "gsldlc1    $f14, 0x8($9)                             \n\t"
2013     "gsldrc1    $f6, -0x1($9)                             \n\t"
2014     "gsldrc1    $f14, 0x1($9)                             \n\t"
2015 
2016     "pasubub    $f4, $f4, $f0                             \n\t"
2017     "pasubub    $f6, $f6, $f2                             \n\t"
2018     "biadd      $f4, $f4                                  \n\t"
2019     "biadd      $f6, $f6                                  \n\t"
2020     "paddh      $f24, $f24, $f4                           \n\t"
2021     "paddh      $f26, $f26, $f6                           \n\t"
2022     "pasubub    $f12, $f12, $f0                           \n\t"
2023     "pasubub    $f14, $f14, $f2                           \n\t"
2024     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
2025     "biadd      $f12, $f12                                \n\t"
2026     "biadd      $f14, $f14                                \n\t"
2027     "paddh      $f28, $f28, $f12                          \n\t"
2028     "paddh      $f30, $f30, $f14                          \n\t"
2029 
2030     "gsldlc1    $f12, 0x7($9)                             \n\t"
2031     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
2032     "gsldrc1    $f12, 0x0($9)                             \n\t"
2033     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
2034     "pasubub    $f0, $f0, $f12                            \n\t"
2035     "pasubub    $f2, $f2, $f14                            \n\t"
2036     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
2037     "biadd      $f0, $f0                                  \n\t"
2038     "biadd      $f2, $f2                                  \n\t"
2039     "paddh      $f20, $f20, $f0                           \n\t"
2040     "paddh      $f22, $f22, $f2                           \n\t"
2041 
2042     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
2043     "gsldlc1    $f2, 0x7($8)                              \n\t"
2044     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
2045     "gsldrc1    $f2, 0x0($8)                              \n\t"
2046     "pasubub    $f12, $f12, $f0                           \n\t"
2047     "pasubub    $f14, $f14, $f2                           \n\t"
2048     "biadd      $f12, $f12                                \n\t"
2049     "biadd      $f14, $f14                                \n\t"
2050     "paddh      $f16, $f16, $f12                          \n\t"
2051     "paddh      $f18, $f18, $f14                          \n\t"
2052 
2053     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
2054     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
2055     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
2056     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
2057     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
2058     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
2059 
2060     "gsldlc1    $f6, 0x6($9)                              \n\t"
2061     "gsldlc1    $f14, 0x8($9)                             \n\t"
2062     "gsldrc1    $f6, -0x1($9)                             \n\t"
2063     "gsldrc1    $f14, 0x1($9)                             \n\t"
2064 
2065     "pasubub    $f4, $f4, $f0                             \n\t"
2066     "pasubub    $f6, $f6, $f2                             \n\t"
2067     "biadd      $f4, $f4                                  \n\t"
2068     "biadd      $f6, $f6                                  \n\t"
2069     "paddh      $f24, $f24, $f4                           \n\t"
2070     "paddh      $f26, $f26, $f6                           \n\t"
2071     "pasubub    $f12, $f12, $f0                           \n\t"
2072     "pasubub    $f14, $f14, $f2                           \n\t"
2073     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
2074     "biadd      $f12, $f12                                \n\t"
2075     "biadd      $f14, $f14                                \n\t"
2076     "paddh      $f28, $f28, $f12                          \n\t"
2077     "paddh      $f30, $f30, $f14                          \n\t"
2078 
2079     "gsldlc1    $f12, 0x7($9)                             \n\t"
2080     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
2081     "gsldrc1    $f12, 0x0($9)                             \n\t"
2082     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
2083     "pasubub    $f0, $f0, $f12                            \n\t"
2084     "pasubub    $f2, $f2, $f14                            \n\t"
2085     PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
2086     "biadd      $f0, $f0                                  \n\t"
2087     "biadd      $f2, $f2                                  \n\t"
2088     "paddh      $f20, $f20, $f0                           \n\t"
2089     "paddh      $f22, $f22, $f2                           \n\t"
2090 
2091     "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
2092     "gsldlc1    $f2, 0x7($8)                              \n\t"
2093     "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
2094     "gsldrc1    $f2, 0x0($8)                              \n\t"
2095     "pasubub    $f12, $f12, $f0                           \n\t"
2096     "pasubub    $f14, $f14, $f2                           \n\t"
2097     "biadd      $f12, $f12                                \n\t"
2098     "biadd      $f14, $f14                                \n\t"
2099     "paddh      $f16, $f16, $f12                          \n\t"
2100     "paddh      $f18, $f18, $f14                          \n\t"
2101 
2102     "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
2103     "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
2104     PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
2105     PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
2106     "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
2107     "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
2108 
2109     "gsldlc1    $f6, 0x6($9)                              \n\t"
2110     "gsldlc1    $f14, 0x8($9)                             \n\t"
2111     "gsldrc1    $f6, -0x1($9)                             \n\t"
2112     "gsldrc1    $f14, 0x1($9)                             \n\t"
2113 
2114     "pasubub    $f4, $f4, $f0                             \n\t"
2115     "pasubub    $f6, $f6, $f2                             \n\t"
2116     "biadd      $f4, $f4                                  \n\t"
2117     "biadd      $f6, $f6                                  \n\t"
2118     "paddh      $f24, $f24, $f4                           \n\t"
2119     "paddh      $f26, $f26, $f6                           \n\t"
2120     "pasubub    $f12, $f12, $f0                           \n\t"
2121     "pasubub    $f14, $f14, $f2                           \n\t"
2122     PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
2123     "biadd      $f12, $f12                                \n\t"
2124     "biadd      $f14, $f14                                \n\t"
2125     "paddh      $f28, $f28, $f12                          \n\t"
2126     "paddh      $f30, $f30, $f14                          \n\t"
2127 
2128     "gsldlc1    $f12, 0x7($9)                             \n\t"
2129     "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
2130     "gsldrc1    $f12, 0x0($9)                             \n\t"
2131     "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
2132     "pasubub    $f0, $f0, $f12                            \n\t"
2133     "pasubub    $f2, $f2, $f14                            \n\t"
2134     "biadd      $f0, $f0                                  \n\t"
2135     "biadd      $f2, $f2                                  \n\t"
2136     "paddh      $f20, $f20, $f0                           \n\t"
2137     "paddh      $f22, $f22, $f2                           \n\t"
2138 
2139     "paddh      $f16, $f16, $f18                          \n\t"
2140     "paddh      $f20, $f20, $f22                          \n\t"
2141     "paddh      $f24, $f24, $f26                          \n\t"
2142     "paddh      $f28, $f28, $f30                          \n\t"
2143     "punpcklwd  $f16, $f16, $f20                          \n\t"
2144     "punpcklwd  $f24, $f24, $f28                          \n\t"
2145     "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
2146     : [pSample1]"+&r"((unsigned char *)pSample1),
2147       [pSample2]"+&r"((unsigned char *)pSample2)
2148     : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
2149       [pSad]"r"((int *)pSad)
2150     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
2151       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
2152   );
2153   RECOVER_REG;
2154 }
2155