• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    mc.c
33  *
34  * \brief   Interfaces implementation for motion compensation
35  *
36  * \date    03/17/2009 Created
37  *
38  *************************************************************************************
39  */
40 
41 #include "mc.h"
42 
43 #include "cpu_core.h"
44 #include "ls_defines.h"
45 #include "macros.h"
46 #include "asmdefs_mmi.h"
47 
48 namespace {
49 
// Chroma MC kernel signature: source and destination pointers with their strides, a pointer to
// interpolation weights (kpABCD) and the number of rows to process.
// NOTE(review): kpABCD presumably points at one 4-entry weight set of g_kuiABCD -- confirm against users.
typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                       const uint8_t* kpABCD, int32_t iHeight);
// Averaging kernel signature; the parameters are unnamed here.
// NOTE(review): the order looks like (dst, dstStride, srcA, srcAStride, srcB, srcBStride, height), which is
// how the PixelAvgWidthEq* routines are invoked in this file -- confirm.
typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,
    int32_t, int32_t);
// Generic MC kernel signature taking an explicit block width and height; McLuma_c dispatches
// through a table of this type.
typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                        int32_t iWidth, int32_t iHeight);
56 
57 /*------------------weight for chroma fraction pixel interpolation------------------*/
58 //iA = (8 - dx) * (8 - dy);
59 //iB = dx * (8 - dy);
60 //iC = (8 - dx) * dy;
61 //iD = dx * dy
// Bilinear weight table, indexed [dy][dx][k] where k = 0..3 selects A, B, C, D.
// For every (dy, dx) the four weights add up to 64.
static const uint8_t g_kuiABCD[8][8][4] = {
  { // dy = 0
    {64,  0,  0,  0}, {56,  8,  0,  0}, {48, 16,  0,  0}, {40, 24,  0,  0},
    {32, 32,  0,  0}, {24, 40,  0,  0}, {16, 48,  0,  0}, { 8, 56,  0,  0}
  },
  { // dy = 1
    {56,  0,  8,  0}, {49,  7,  7,  1}, {42, 14,  6,  2}, {35, 21,  5,  3},
    {28, 28,  4,  4}, {21, 35,  3,  5}, {14, 42,  2,  6}, { 7, 49,  1,  7}
  },
  { // dy = 2
    {48,  0, 16,  0}, {42,  6, 14,  2}, {36, 12, 12,  4}, {30, 18, 10,  6},
    {24, 24,  8,  8}, {18, 30,  6, 10}, {12, 36,  4, 12}, { 6, 42,  2, 14}
  },
  { // dy = 3
    {40,  0, 24,  0}, {35,  5, 21,  3}, {30, 10, 18,  6}, {25, 15, 15,  9},
    {20, 20, 12, 12}, {15, 25,  9, 15}, {10, 30,  6, 18}, { 5, 35,  3, 21}
  },
  { // dy = 4
    {32,  0, 32,  0}, {28,  4, 28,  4}, {24,  8, 24,  8}, {20, 12, 20, 12},
    {16, 16, 16, 16}, {12, 20, 12, 20}, { 8, 24,  8, 24}, { 4, 28,  4, 28}
  },
  { // dy = 5
    {24,  0, 40,  0}, {21,  3, 35,  5}, {18,  6, 30, 10}, {15,  9, 25, 15},
    {12, 12, 20, 20}, { 9, 15, 15, 25}, { 6, 18, 10, 30}, { 3, 21,  5, 35}
  },
  { // dy = 6
    {16,  0, 48,  0}, {14,  2, 42,  6}, {12,  4, 36, 12}, {10,  6, 30, 18},
    { 8,  8, 24, 24}, { 6, 10, 18, 30}, { 4, 12, 12, 36}, { 2, 14,  6, 42}
  },
  { // dy = 7
    { 8,  0, 56,  0}, { 7,  1, 49,  7}, { 6,  2, 42, 14}, { 5,  3, 35, 21},
    { 4,  4, 28, 28}, { 3,  5, 21, 35}, { 2,  6, 14, 42}, { 1,  7,  7, 49}
  }
};
96 
97 //***************************************************************************//
98 //                          C code implementation                            //
99 //***************************************************************************//
McCopyWidthEq2_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)100 static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
101                                      int32_t iHeight) {
102   int32_t i;
103   for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
104     ST16A2 (pDst, LD16 (pSrc));
105     pDst += iDstStride;
106     pSrc += iSrcStride;
107   }
108 }
109 
McCopyWidthEq4_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)110 static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
111                                      int32_t iHeight) {
112   int32_t i;
113   for (i = 0; i < iHeight; i++) {
114     ST32A4 (pDst, LD32 (pSrc));
115     pDst += iDstStride;
116     pSrc += iSrcStride;
117   }
118 }
119 
McCopyWidthEq8_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)120 static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
121                                      int32_t iHeight) {
122   int32_t i;
123   for (i = 0; i < iHeight; i++) {
124     ST64A8 (pDst, LD64 (pSrc));
125     pDst += iDstStride;
126     pSrc += iSrcStride;
127   }
128 }
129 
McCopyWidthEq16_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)130 static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
131                                       int32_t iHeight) {
132   int32_t i;
133   for (i = 0; i < iHeight; i++) {
134     ST64A8 (pDst  , LD64 (pSrc));
135     ST64A8 (pDst + 8, LD64 (pSrc + 8));
136     pDst += iDstStride;
137     pSrc += iSrcStride;
138   }
139 }
140 
141 //--------------------Luma sample MC------------------//
142 
/*!
 * \brief  Apply the 6-tap kernel (1, -5, 20, 20, -5, 1) to six consecutive 16-bit values.
 * \param  pSrc  points at the first of the six taps, pSrc[0] .. pSrc[5]
 * \return the unnormalised filter sum; the caller applies rounding and shifting
 */
static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) {
  // The kernel is symmetric, so taps are paired from the outside in.
  const int32_t kiOuterPair  = pSrc[0] + pSrc[5];
  const int32_t kiMiddlePair = pSrc[1] + pSrc[4];
  const int32_t kiInnerPair  = pSrc[2] + pSrc[3];

  return kiOuterPair - 5 * kiMiddlePair + 20 * kiInnerPair;
}
150 // h: iOffset=1 / v: iOffset=iSrcStride
/*!
 * \brief  Apply the 6-tap kernel (1, -5, 20, 20, -5, 1) to 8-bit samples spaced kiOffset bytes apart.
 *
 * The taps are read at pSrc - 2*kiOffset, pSrc - kiOffset, pSrc, pSrc + kiOffset,
 * pSrc + 2*kiOffset and pSrc + 3*kiOffset, so all six addresses must be readable.
 *
 * \param  pSrc      sample immediately before the half-sample position being interpolated
 * \param  kiOffset  distance in bytes between neighbouring taps
 * \return the unnormalised filter sum, in the range [-2550, 10710]
 */
static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
  // Multiply instead of shifting: left-shifting a negative offset is undefined before C++20.
  const int32_t kiOffset2 = kiOffset * 2;
  const int32_t kiOffset3 = kiOffset * 3;
  // Signed sums keep a negative result well defined instead of relying on unsigned wraparound
  // followed by an implementation-defined conversion back to int32_t.
  const int32_t kiPix05 = pSrc[-kiOffset2] + pSrc[kiOffset3];
  const int32_t kiPix14 = pSrc[-kiOffset]  + pSrc[kiOffset2];
  const int32_t kiPix23 = pSrc[0]          + pSrc[kiOffset];

  return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
}
161 
/*!
 * \brief  Write the rounded average of two 8-bit planes: pDst = (pSrcA + pSrcB + 1) >> 1.
 * \param  pDst         destination, iWidth x iHeight samples
 * \param  iDstStride   distance in bytes between destination rows
 * \param  pSrcA        first input plane
 * \param  iSrcAStride  distance in bytes between rows of pSrcA
 * \param  pSrcB        second input plane
 * \param  iSrcBStride  distance in bytes between rows of pSrcB
 * \param  iWidth       samples per row
 * \param  iHeight      number of rows
 */
static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
      const int32_t kiRoundedSum = pSrcA[iCol] + pSrcB[iCol] + 1;
      pDst[iCol] = (uint8_t) (kiRoundedSum >> 1);
    }
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    pDst  += iDstStride;
  }
}
McCopy_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)174 static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
175                              int32_t iHeight) {
176   if (iWidth == 16)
177     McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
178   else if (iWidth == 8)
179     McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
180   else if (iWidth == 4)
181     McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
182   else //here iWidth == 2
183     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
184 }
185 
186 //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
McHorVer20_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)187 static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
188                                  int32_t iWidth,
189                                  int32_t iHeight) {
190   int32_t i, j;
191   for (i = 0; i < iHeight; i++) {
192     for (j = 0; j < iWidth; j++) {
193       pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5);
194     }
195     pDst += iDstStride;
196     pSrc += iSrcStride;
197   }
198 }
199 
200 //vertical filter to gain half sample, that is (0, 2) location in quarter sample
McHorVer02_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)201 static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
202                                  int32_t iWidth,
203                                  int32_t iHeight) {
204   int32_t i, j;
205   for (i = 0; i < iHeight; i++) {
206     for (j = 0; j < iWidth; j++) {
207       pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5);
208     }
209     pDst += iDstStride;
210     pSrc += iSrcStride;
211   }
212 }
213 
214 //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
McHorVer22_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)215 static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
216                                  int32_t iWidth,
217                                  int32_t iHeight) {
218   int16_t iTmp[17 + 5];
219   int32_t i, j, k;
220 
221   for (i = 0; i < iHeight; i++) {
222     for (j = 0; j < iWidth + 5; j++) {
223       iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
224     }
225     for (k = 0; k < iWidth; k++) {
226       pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
227     }
228     pSrc += iSrcStride;
229     pDst += iDstStride;
230   }
231 }
232 
233 /////////////////////luma MC//////////////////////////
McHorVer01_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)234 static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
235                                  int32_t iWidth,
236                                  int32_t iHeight) {
237   uint8_t uiTmp[256];
238   McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
239   PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
240 }
McHorVer03_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)241 static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
242                                  int32_t iWidth,
243                                  int32_t iHeight) {
244   uint8_t uiTmp[256];
245   McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
246   PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
247 }
McHorVer10_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)248 static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
249                                  int32_t iWidth,
250                                  int32_t iHeight) {
251   uint8_t uiTmp[256];
252   McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
253   PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
254 }
McHorVer11_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)255 static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
256                                  int32_t iWidth,
257                                  int32_t iHeight) {
258   uint8_t uiHorTmp[256];
259   uint8_t uiVerTmp[256];
260   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
261   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
262   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
263 }
McHorVer12_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)264 static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
265                                  int32_t iWidth,
266                                  int32_t iHeight) {
267   uint8_t uiVerTmp[256];
268   uint8_t uiCtrTmp[256];
269   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
270   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
271   PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
272 }
McHorVer13_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)273 static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
274                                  int32_t iWidth,
275                                  int32_t iHeight) {
276   uint8_t uiHorTmp[256];
277   uint8_t uiVerTmp[256];
278   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
279   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
280   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
281 }
McHorVer21_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)282 static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
283                                  int32_t iWidth,
284                                  int32_t iHeight) {
285   uint8_t uiHorTmp[256];
286   uint8_t uiCtrTmp[256];
287   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
288   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
289   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
290 }
McHorVer23_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)291 static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
292                                  int32_t iWidth,
293                                  int32_t iHeight) {
294   uint8_t uiHorTmp[256];
295   uint8_t uiCtrTmp[256];
296   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
297   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
298   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
299 }
McHorVer30_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)300 static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
301                                  int32_t iWidth,
302                                  int32_t iHeight) {
303   uint8_t uiHorTmp[256];
304   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
305   PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
306 }
McHorVer31_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)307 static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
308                                  int32_t iWidth,
309                                  int32_t iHeight) {
310   uint8_t uiHorTmp[256];
311   uint8_t uiVerTmp[256];
312   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
313   McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
314   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
315 }
McHorVer32_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)316 static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
317                                  int32_t iWidth,
318                                  int32_t iHeight) {
319   uint8_t uiVerTmp[256];
320   uint8_t uiCtrTmp[256];
321   McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
322   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
323   PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
324 }
McHorVer33_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)325 static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
326                                  int32_t iWidth,
327                                  int32_t iHeight) {
328   uint8_t uiHorTmp[256];
329   uint8_t uiVerTmp[256];
330   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
331   McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
332   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
333 }
334 
McLuma_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)335 void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
336                int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
337 //pSrc has been added the offset of mv
338 {
339   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
340     {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
341     {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
342     {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
343     {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
344   };
345 
346   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
347 }
348 
McChromaWithFragMv_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)349 static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
350     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
351   int32_t i, j;
352   int32_t iA, iB, iC, iD;
353   const uint8_t* pSrcNext = pSrc + iSrcStride;
354   const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
355   iA = pABCD[0];
356   iB = pABCD[1];
357   iC = pABCD[2];
358   iD = pABCD[3];
359   for (i = 0; i < iHeight; i++) {
360     for (j = 0; j < iWidth; j++) {
361       pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6;
362     }
363     pDst     += iDstStride;
364     pSrc      = pSrcNext;
365     pSrcNext += iSrcStride;
366   }
367 }
368 
McChroma_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)369 void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
370                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
371 //pSrc has been added the offset of mv
372 {
373   const int32_t kiD8x = iMvX & 0x07;
374   const int32_t kiD8y = iMvY & 0x07;
375   if (0 == kiD8x && 0 == kiD8y)
376     McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
377   else
378     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
379 }
380 
381 #if defined(X86_ASM)
382 //***************************************************************************//
383 //                       SSE2 implement                          //
384 //***************************************************************************//
McHorVer22WidthEq8_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)385 static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
386     int32_t iHeight) {
387   ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
388   McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
389   McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
390 }
391 
McHorVer02WidthEq16_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)392 static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
393     int32_t iHeight) {
394   McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
395   McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
396 }
397 
McHorVer22WidthEq16_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)398 static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
399     int32_t iHeight) {
400   McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
401   McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
402 }
403 
McHorVer20Width5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)404 void McHorVer20Width5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
405         int32_t iWidth, int32_t iHeight) {
406     if (iWidth == 17 || iWidth == 9)
407         McHorVer20Width9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
408     else //if (iWidth == 5)
409         McHorVer20Width5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
410 }
411 
McHorVer02Height5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)412 void McHorVer02Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
413         int32_t iWidth, int32_t iHeight) {
414     if (iWidth == 16 || iWidth == 8)
415         McHorVer02Height9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
416     else //if (iWidth == 4)
417         McHorVer02Height5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
418 }
419 
McHorVer22Width5Or9Or17Height5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)420 void McHorVer22Width5Or9Or17Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
421         int32_t iWidth, int32_t iHeight) {
422     ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
423     if (iWidth == 17 || iWidth == 9){
424         int32_t tmp1 = 2 * (iWidth - 8);
425         McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
426         McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
427         McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
428     }
429     else{ //if(iWidth == 5)
430         int32_t tmp1 = 2 * (iWidth - 4);
431         McHorVer22Width5HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
432         McHorVer22Width4VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
433         McHorVer22Width4VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 4, iDstStride, 4, iHeight);
434     }
435 
436 }
437 
McCopy_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)438 static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
439                                 int32_t iWidth,
440                                 int32_t iHeight) {
441   if (iWidth == 16)
442     McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
443   else if (iWidth == 8)
444     McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
445   else if (iWidth == 4)
446     McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
447   else
448     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
449 }
450 
McHorVer20_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)451 static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
452                                     int32_t iWidth, int32_t iHeight) {
453   if (iWidth == 16)
454     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
455   else if (iWidth == 8)
456     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
457   else
458     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
459 }
460 
McHorVer02_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)461 static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
462                                     int32_t iWidth, int32_t iHeight) {
463   if (iWidth == 16)
464     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
465   else if (iWidth == 8)
466     McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
467   else
468     McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
469 }
470 
McHorVer22_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)471 static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
472                                     int32_t iWidth, int32_t iHeight) {
473   if (iWidth == 16)
474     McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
475   else if (iWidth == 8)
476     McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
477   else
478     McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
479 }
480 
McHorVer01_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)481 static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
482                                     int32_t iWidth, int32_t iHeight) {
483   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
484   if (iWidth == 16) {
485     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
486     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
487   } else if (iWidth == 8) {
488     McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
489     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
490   } else {
491     McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
492     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
493   }
494 }
McHorVer03_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)495 static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
496                                     int32_t iWidth, int32_t iHeight) {
497   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
498   if (iWidth == 16) {
499     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
500     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
501   } else if (iWidth == 8) {
502     McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
503     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
504   } else {
505     McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
506     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
507   }
508 }
McHorVer10_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)509 static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
510                                     int32_t iWidth, int32_t iHeight) {
511   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
512   if (iWidth == 16) {
513     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
514     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
515   } else if (iWidth == 8) {
516     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
517     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
518   } else {
519     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
520     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
521   }
522 }
McHorVer11_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)523 static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
524                                     int32_t iWidth, int32_t iHeight) {
525   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
526   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
527   if (iWidth == 16) {
528     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
529     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
530     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
531   } else if (iWidth == 8) {
532     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
533     McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
534     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
535   } else {
536     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
537     McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
538     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
539   }
540 }
McHorVer12_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)541 static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
542                                     int32_t iWidth, int32_t iHeight) {
543   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
544   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
545   if (iWidth == 16) {
546     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
547     McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
548     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
549   } else if (iWidth == 8) {
550     McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
551     McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
552     PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
553   } else {
554     McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
555     McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
556     PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
557   }
558 }
McHorVer13_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)559 static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
560                                     int32_t iWidth, int32_t iHeight) {
561   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
562   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
563   if (iWidth == 16) {
564     McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
565     McHorVer02WidthEq16_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
566     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
567   } else if (iWidth == 8) {
568     McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
569     McHorVer02WidthEq8_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
570     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
571   } else {
572     McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
573     McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
574     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
575   }
576 }
McHorVer21_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)577 static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
578                                     int32_t iWidth, int32_t iHeight) {
579   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
580   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
581   if (iWidth == 16) {
582     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
583     McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
584     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
585   } else if (iWidth == 8) {
586     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
587     McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
588     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
589   } else {
590     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
591     McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
592     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
593   }
594 }
McHorVer23_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)595 static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
596                                     int32_t iWidth, int32_t iHeight) {
597   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
598   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
599   if (iWidth == 16) {
600     McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
601     McHorVer22WidthEq16_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
602     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
603   } else if (iWidth == 8) {
604     McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
605     McHorVer22WidthEq8_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
606     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
607   } else {
608     McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
609     McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
610     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
611   }
612 }
McHorVer30_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)613 static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
614                                     int32_t iWidth, int32_t iHeight) {
615   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
616   if (iWidth == 16) {
617     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
618     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
619   } else if (iWidth == 8) {
620     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
621     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
622   } else {
623     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
624     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
625   }
626 }
McHorVer31_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)627 static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
628                                     int32_t iWidth, int32_t iHeight) {
629   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
630   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
631   if (iWidth == 16) {
632     McHorVer20WidthEq16_sse2 (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
633     McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
634     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
635   } else if (iWidth == 8) {
636     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
637     McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
638     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
639   } else {
640     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
641     McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
642     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
643   }
644 }
McHorVer32_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)645 static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
646                                     int32_t iWidth, int32_t iHeight) {
647   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
648   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
649   if (iWidth == 16) {
650     McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
651     McHorVer22WidthEq16_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
652     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
653   } else if (iWidth == 8) {
654     McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
655     McHorVer22WidthEq8_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
656     PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
657   } else {
658     McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
659     McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
660     PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
661   }
662 }
McHorVer33_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)663 static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
664                                     int32_t iWidth, int32_t iHeight) {
665   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
666   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
667   if (iWidth == 16) {
668     McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
669     McHorVer02WidthEq16_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
670     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
671   } else if (iWidth == 8) {
672     McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
673     McHorVer02WidthEq8_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
674     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
675   } else {
676     McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
677     McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
678     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
679   }
680 }
681 
McLuma_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)682 void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
683                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
684 //pSrc has been added the offset of mv
685 {
686   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
687     {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
688     {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
689     {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
690     {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
691   };
692 
693   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
694 }
695 
McChroma_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)696 void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
697                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
698   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
699     McChromaWidthEq4_mmx,
700     McChromaWidthEq8_sse2
701   };
702   const int32_t kiD8x = iMvX & 0x07;
703   const int32_t kiD8y = iMvY & 0x07;
704   if (kiD8x == 0 && kiD8y == 0) {
705     McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
706     return;
707   }
708   if (iWidth != 2) {
709     kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
710   } else
711     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
712 }
713 
714 //***************************************************************************//
715 //                          SSSE3 implementation                             //
716 //***************************************************************************//
717 
PixelAvgWidth4Or8Or16_sse2(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)718 void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
719                                  const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
720   if (iWidth < 8) {
721     PixelAvgWidthEq4_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
722   } else if (iWidth == 8) {
723     PixelAvgWidthEq8_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
724   } else {
725     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
726   }
727 }
728 
McCopy_sse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)729 void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
730                   int32_t iWidth, int32_t iHeight) {
731   switch (iWidth) {
732   case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
733   case 8:  return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
734   case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
735   }
736   return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
737 }
738 
McHorVer22_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)739 void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
740                        int32_t iWidth, int32_t iHeight) {
741   ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16);
742   if (iWidth < 8) {
743     McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
744     McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);
745   } else if (iWidth == 8) {
746     McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
747     McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
748   } else {
749     McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
750     McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);
751     McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
752     McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight);
753   }
754 }
755 
McHorVer01_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)756 void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
757                        int32_t iWidth, int32_t iHeight) {
758   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
759   McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
760   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
761                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
762 }
763 
McHorVer03_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)764 void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
765                        int32_t iWidth, int32_t iHeight) {
766   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
767   McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
768   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
769                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
770 }
771 
McHorVer10_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)772 void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
773                        int32_t iWidth, int32_t iHeight) {
774   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
775   McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
776   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
777                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
778 }
779 
McHorVer11_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)780 void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
781                        int32_t iWidth, int32_t iHeight) {
782   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
783   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
784   McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
785   McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
786   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
787                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
788 }
789 
McHorVer12_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)790 void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
791                        int32_t iWidth, int32_t iHeight) {
792   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
793   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
794   McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
795   McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
796   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
797                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
798 }
799 
McHorVer13_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)800 void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
801                        int32_t iWidth, int32_t iHeight) {
802   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
803   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
804   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
805   McHorVer02_ssse3 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
806   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
807                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
808 }
809 
McHorVer21_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)810 void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
811                        int32_t iWidth, int32_t iHeight) {
812   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
813   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
814   McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
815   McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
816   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
817                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
818 }
819 
McHorVer23_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)820 void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
821                        int32_t iWidth, int32_t iHeight) {
822   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
823   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
824   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
825   McHorVer22_ssse3 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
826   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
827                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
828 }
829 
McHorVer30_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)830 void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
831                        int32_t iWidth, int32_t iHeight) {
832   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
833   McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
834   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
835 }
836 
McHorVer31_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)837 void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
838                        int32_t iWidth, int32_t iHeight) {
839   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
840   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
841   McHorVer20_ssse3 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
842   McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
843   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
844                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
845 }
846 
McHorVer32_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)847 void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
848                        int32_t iWidth, int32_t iHeight) {
849   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
850   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
851   McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
852   McHorVer22_ssse3 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
853   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
854                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
855 }
856 
McHorVer33_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)857 void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
858                        int32_t iWidth, int32_t iHeight) {
859   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
860   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
861   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
862   McHorVer02_ssse3 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
863   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
864                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
865 }
866 
McHorVer22Width5Or9Or17_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)867 void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
868                                     int32_t iWidth, int32_t iHeight) {
869   ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16)
870   if (iWidth > 5) {
871     McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5);
872     McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
873   } else {
874     McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
875     McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);
876   }
877 }
878 
McLuma_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)879 void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
880                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
881   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
882     {McCopy_sse3,      McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},
883     {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},
884     {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},
885     {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},
886   };
887 
888   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
889 }
890 
McChroma_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)891 void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
892                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
893   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
894     McChromaWidthEq4_mmx,
895     McChromaWidthEq8_ssse3
896   };
897   const int32_t kiD8x = iMvX & 0x07;
898   const int32_t kiD8y = iMvY & 0x07;
899   if (kiD8x == 0 && kiD8y == 0) {
900     McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
901     return;
902   }
903   if (iWidth != 2) {
904     kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
905   } else
906     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
907 }
908 
909 //***************************************************************************//
910 //                          AVX2 implementation                              //
911 //***************************************************************************//
912 
913 #ifdef HAVE_AVX2
914 
McHorVer22_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)915 void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
916                       int32_t iWidth, int32_t iHeight) {
917   ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 16, 32);
918   if (iWidth < 8) {
919     McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
920     McHorVer02Width4S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
921   } else if (iWidth == 8) {
922     McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
923     McHorVer02Width8S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
924   } else {
925     McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
926     McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
927   }
928 }
929 
McHorVer01_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)930 void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
931                       int32_t iWidth, int32_t iHeight) {
932   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
933   McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
934   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
935                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
936 }
937 
McHorVer03_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)938 void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
939                       int32_t iWidth, int32_t iHeight) {
940   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
941   McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
942   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
943                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
944 }
945 
McHorVer10_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)946 void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
947                       int32_t iWidth, int32_t iHeight) {
948   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
949   McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
950   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
951                               &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
952 }
953 
McHorVer11_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)954 void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
955                       int32_t iWidth, int32_t iHeight) {
956   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
957   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
958   McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
959   McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
960   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
961                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
962 }
963 
McHorVer12_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)964 void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
965                       int32_t iWidth, int32_t iHeight) {
966   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
967   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
968   McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
969   McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
970   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
971                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
972 }
973 
McHorVer13_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)974 void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
975                       int32_t iWidth, int32_t iHeight) {
976   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
977   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
978   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
979   McHorVer02_avx2 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
980   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
981                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
982 }
983 
McHorVer21_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)984 void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
985                       int32_t iWidth, int32_t iHeight) {
986   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
987   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
988   McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
989   McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
990   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
991                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
992 }
993 
McHorVer23_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)994 void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
995                       int32_t iWidth, int32_t iHeight) {
996   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
997   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
998   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
999   McHorVer22_avx2 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1000   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1001                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1002 }
1003 
McHorVer30_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1004 void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1005                       int32_t iWidth, int32_t iHeight) {
1006   ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
1007   McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1008   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1009 }
1010 
McHorVer31_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1011 void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1012                       int32_t iWidth, int32_t iHeight) {
1013   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1014   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1015   McHorVer20_avx2 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1016   McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1017   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1018                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1019 }
1020 
McHorVer32_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1021 void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1022                       int32_t iWidth, int32_t iHeight) {
1023   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1024   ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
1025   McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1026   McHorVer22_avx2 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1027   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
1028                               &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1029 }
1030 
McHorVer33_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1031 void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1032                       int32_t iWidth, int32_t iHeight) {
1033   ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1034   ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1035   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1036   McHorVer02_avx2 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1037   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1038                               &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1039 }
1040 
McHorVer22Width5Or9Or17_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1041 void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1042                                    int32_t iWidth, int32_t iHeight) {
1043   if (iWidth < 9) {
1044     ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16)
1045     McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1046     McHorVer02Width5S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1047   } else if (iWidth == 9) {
1048     ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, 16, 32)
1049     McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1050     McHorVer02Width9S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1051   } else {
1052     ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32)
1053     McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1054     McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
1055   }
1056 }
1057 
McLuma_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1058 void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1059                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1060   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
1061     {McCopy_sse3,     McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2},
1062     {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2},
1063     {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2},
1064     {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2},
1065   };
1066 
1067   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1068 }
1069 
1070 #endif //HAVE_AVX2
1071 
PixelAvg_sse2(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1072 void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1073                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1074   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1075     PixelAvgWidthEq8_mmx,
1076     PixelAvgWidthEq16_sse2
1077   };
1078   kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1079 }
1080 
1081 #endif //X86_ASM
1082 //***************************************************************************//
1083 //                       NEON implementation                      //
1084 //***************************************************************************//
1085 #if defined(HAVE_NEON)
// Width dispatcher for the NEON McHorVer20 Width17/Width9/Width5 kernels.
// 17 and 9 select their own kernel; every other width falls through to the
// Width5 kernel.
void McHorVer20Width5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 17:
    McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 9:
    McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 5
    McHorVer20Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Dispatcher for the NEON McHorVer02 Height17/Height9/Height5 kernels.
// Selection is keyed on iWidth, not iHeight: 16 -> Height17, 8 -> Height9,
// anything else -> Height5. iHeight is forwarded unchanged.
void McHorVer02Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                 int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 4
    McHorVer02Height5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for the NEON McHorVer22 Width17/Width9/Width5 kernels.
// 17 and 9 select their own kernel; every other width falls through to the
// Width5 kernel.
void McHorVer22Width5Or9Or17Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
    int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 17:
    McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 9:
    McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 5
    McHorVer22Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for block copy (NEON path): 16, 8 and 4 use the matching
// McCopyWidthEq*_neon kernel; any other width goes to McCopyWidthEq2_c.
void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                  int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for the NEON McHorVer20 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the NEON McHorVer02 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the NEON McHorVer22 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
1151 
// Width dispatcher for the NEON McHorVer01 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the NEON McHorVer03 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the NEON McHorVer10 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
McHorVer11_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1179 void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1180                       int32_t iWidth, int32_t iHeight) {
1181   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1182   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1183   if (iWidth == 16) {
1184     McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1185     McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1186     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1187   } else if (iWidth == 8) {
1188     McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1189     McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1190     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1191   } else if (iWidth == 4) {
1192     McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1193     McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1194     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1195   }
1196 }
McHorVer12_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1197 void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1198                       int32_t iWidth, int32_t iHeight) {
1199   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1200   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1201   if (iWidth == 16) {
1202     McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1203     McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1204     PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1205   } else if (iWidth == 8) {
1206     McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1207     McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1208     PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1209   } else if (iWidth == 4) {
1210     McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1211     McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1212     PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1213   }
1214 }
McHorVer13_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1215 void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1216                       int32_t iWidth, int32_t iHeight) {
1217   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1218   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1219   if (iWidth == 16) {
1220     McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1221     McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1222     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1223   } else if (iWidth == 8) {
1224     McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1225     McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1226     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1227   } else if (iWidth == 4) {
1228     McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1229     McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1230     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1231   }
1232 }
McHorVer21_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1233 void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1234                       int32_t iWidth, int32_t iHeight) {
1235   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1236   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1237   if (iWidth == 16) {
1238     McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1239     McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1240     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1241   } else if (iWidth == 8) {
1242     McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1243     McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1244     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1245   } else if (iWidth == 4) {
1246     McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1247     McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1248     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1249   }
1250 }
McHorVer23_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1251 void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1252                       int32_t iWidth, int32_t iHeight) {
1253   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1254   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1255   if (iWidth == 16) {
1256     McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1257     McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1258     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1259   } else if (iWidth == 8) {
1260     McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1261     McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1262     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1263   } else if (iWidth == 4) {
1264     McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1265     McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1266     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1267   }
1268 }
// Width dispatcher for the NEON McHorVer30 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
McHorVer31_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1278 void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1279                       int32_t iWidth, int32_t iHeight) {
1280   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1281   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1282   if (iWidth == 16) {
1283     McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1284     McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1285     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1286   } else if (iWidth == 8) {
1287     McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1288     McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1289     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1290   } else if (iWidth == 4) {
1291     McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1292     McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1293     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1294   }
1295 }
McHorVer32_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1296 void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1297                       int32_t iWidth, int32_t iHeight) {
1298   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1299   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1300   if (iWidth == 16) {
1301     McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1302     McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1303     PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1304   } else if (iWidth == 8) {
1305     McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1306     McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1307     PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1308   } else if (iWidth == 4) {
1309     McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1310     McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1311     PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1312   }
1313 }
McHorVer33_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1314 void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1315                       int32_t iWidth, int32_t iHeight) {
1316   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1317   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1318   if (iWidth == 16) {
1319     McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1320     McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1321     PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1322   } else if (iWidth == 8) {
1323     McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1324     McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1325     PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1326   } else if (iWidth == 4) {
1327     McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1328     McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1329     PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1330   }
1331 }
1332 
McLuma_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1333 void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1334                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1335   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1336     {McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
1337     {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
1338     {McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
1339     {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
1340   };
1341   // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1342   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1343 }
McChroma_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1344 void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1345                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1346   if (0 == iMvX && 0 == iMvY) {
1347     if (8 == iWidth)
1348       McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1349     else if (iWidth == 4)
1350       McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1351     else //here iWidth == 2
1352       McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1353   } else {
1354     const int32_t kiD8x = iMvX & 0x07;
1355     const int32_t kiD8y = iMvY & 0x07;
1356     if (8 == iWidth)
1357       McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1358     else if (4 == iWidth)
1359       McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1360     else //here iWidth == 2
1361       McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1362   }
1363 }
PixelAvg_neon(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1364 void PixelAvg_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1365                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1366   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1367     PixStrideAvgWidthEq8_neon,
1368     PixStrideAvgWidthEq16_neon
1369   };
1370   kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1371 }
1372 #endif
1373 #if defined(HAVE_NEON_AARCH64)
// Width dispatcher for the AArch64 NEON McHorVer20 Width17/Width9/Width5
// kernels. 17 and 9 select their own kernel; every other width falls through
// to the Width5 kernel.
void McHorVer20Width5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                        int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 17:
    McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 9:
    McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 5
    McHorVer20Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Dispatcher for the AArch64 NEON McHorVer02 Height17/Height9/Height5
// kernels. Selection is keyed on iWidth, not iHeight: 16 -> Height17,
// 8 -> Height9, anything else -> Height5. iHeight is forwarded unchanged.
void McHorVer02Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
    int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 4
    McHorVer02Height5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer22 Width17/Width9/Width5
// kernels. 17 and 9 select their own kernel; every other width falls through
// to the Width5 kernel.
void McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
    int32_t iDstStride,
    int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 17:
    McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 9:
    McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // expected to be 5
    McHorVer22Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for block copy (AArch64 NEON path): 16, 8 and 4 use the
// matching McCopyWidthEq*_AArch64_neon kernel; any other width goes to
// McCopyWidthEq2_c.
void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                          int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer20 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer02 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer22 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
1440 
// Width dispatcher for the AArch64 NEON McHorVer01 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer03 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
// Width dispatcher for the AArch64 NEON McHorVer10 kernels: forwards to the
// WidthEq16/8/4 routine matching iWidth. Any other width does nothing.
void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default: // unsupported width: pDst is left untouched
    break;
  }
}
McHorVer11_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1468 void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1469                               int32_t iWidth, int32_t iHeight) {
1470   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1471   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1472   if (iWidth == 16) {
1473     McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1474     McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1475     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1476   } else if (iWidth == 8) {
1477     McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1478     McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1479     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1480   } else if (iWidth == 4) {
1481     McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1482     McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1483     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1484   }
1485 }
McHorVer12_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1486 void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1487                               int32_t iWidth, int32_t iHeight) {
1488   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1489   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1490   if (iWidth == 16) {
1491     McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1492     McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1493     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1494   } else if (iWidth == 8) {
1495     McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1496     McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1497     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1498   } else if (iWidth == 4) {
1499     McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1500     McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1501     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1502   }
1503 }
McHorVer13_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1504 void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1505                               int32_t iWidth, int32_t iHeight) {
1506   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1507   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1508   if (iWidth == 16) {
1509     McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1510     McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1511     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1512   } else if (iWidth == 8) {
1513     McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1514     McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1515     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1516   } else if (iWidth == 4) {
1517     McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1518     McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1519     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1520   }
1521 }
McHorVer21_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1522 void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1523                               int32_t iWidth, int32_t iHeight) {
1524   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1525   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1526   if (iWidth == 16) {
1527     McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1528     McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1529     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1530   } else if (iWidth == 8) {
1531     McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1532     McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1533     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1534   } else if (iWidth == 4) {
1535     McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1536     McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1537     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1538   }
1539 }
McHorVer23_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1540 void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1541                               int32_t iWidth, int32_t iHeight) {
1542   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1543   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1544   if (iWidth == 16) {
1545     McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1546     McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1547     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1548   } else if (iWidth == 8) {
1549     McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1550     McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1551     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1552   } else if (iWidth == 4) {
1553     McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1554     McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1555     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1556   }
1557 }
/*!
 * \brief  Width dispatcher for the McHorVer30 AArch64 NEON kernels.
 *
 * Entry [3][0] of the McLuma_AArch64_neon table, i.e. used when
 * (iMvX & 3) == 3 and (iMvY & 3) == 0.
 *
 * \param iWidth  block width; only 16, 8 and 4 are handled
 *
 * All other arguments are forwarded unchanged to the selected kernel.
 * Any other iWidth is ignored and pDst is left untouched.
 */
void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break; // unsupported width: nothing is written
  }
}
McHorVer31_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1567 void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1568                               int32_t iWidth, int32_t iHeight) {
1569   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1570   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1571   if (iWidth == 16) {
1572     McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1573     McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1574     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1575   } else if (iWidth == 8) {
1576     McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1577     McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1578     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1579   } else if (iWidth == 4) {
1580     McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1581     McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1582     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1583   }
1584 }
McHorVer32_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1585 void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1586                               int32_t iWidth, int32_t iHeight) {
1587   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1588   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1589   if (iWidth == 16) {
1590     McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1591     McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1592     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1593   } else if (iWidth == 8) {
1594     McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1595     McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1596     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1597   } else if (iWidth == 4) {
1598     McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1599     McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1600     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1601   }
1602 }
McHorVer33_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1603 void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1604                               int32_t iWidth, int32_t iHeight) {
1605   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1606   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1607   if (iWidth == 16) {
1608     McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1609     McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1610     PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1611   } else if (iWidth == 8) {
1612     McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1613     McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1614     PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1615   } else if (iWidth == 4) {
1616     McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1617     McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1618     PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1619   }
1620 }
1621 
McLuma_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1622 void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1623                           int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1624   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1625     {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon},
1626     {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
1627     {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon},
1628     {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
1629   };
1630   // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1631   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1632 }
McChroma_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1633 void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1634                             int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1635   if (0 == iMvX && 0 == iMvY) {
1636     if (8 == iWidth)
1637       McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1638     else if (iWidth == 4)
1639       McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1640     else //here iWidth == 2
1641       McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1642   } else {
1643     const int32_t kiD8x = iMvX & 0x07;
1644     const int32_t kiD8y = iMvY & 0x07;
1645     if (8 == iWidth)
1646       McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1647     else if (4 == iWidth)
1648       McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1649     else //here iWidth == 2
1650       McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1651   }
1652 }
PixelAvg_AArch64_neon(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1653 void PixelAvg_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1654                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1655   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1656     PixStrideAvgWidthEq8_AArch64_neon,
1657     PixStrideAvgWidthEq16_AArch64_neon
1658   };
1659   kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1660 }
1661 #endif
1662 
1663 #if defined(HAVE_MMI)
/*
 * MMI_LOAD_8P(f0, f2, f4, r0): asm-template fragment.
 *
 * Loads 8 bytes from address r0 into f0 with a gsldlc1/gsldrc1 pair, so no
 * alignment is required. It then byte-interleaves f0 with f4, leaving the
 * high half in f2 and the low half in f0.
 * NOTE(review): f4 is presumably a zeroed register, which makes this a
 * zero-extension of 8 pixels to 16-bit lanes -- confirm at call sites.
 */
#define MMI_LOAD_8P(f0, f2, f4, r0) \
  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
  "punpckhbh  "#f2", "#f0", "#f4"             \n\t" \
  "punpcklbh  "#f0", "#f0", "#f4"             \n\t"
1669 
/*
 * FILTER_HV_W4: asm-template fragment applying a 6-tap filter to six inputs
 * held as register pairs of 16-bit lanes, then storing 4 result bytes at r0.
 *
 * Taps: t0 = f0/f2, t1 = f4/f6, t2 = f8/f10, t3 = f12/f14, t4 = f16/f18,
 * t5 = f20/f22. Per lane the macro computes
 *     (t0 + t5 + 20 * (t2 + t3) - 5 * (t1 + t4) + 16) >> 5
 * The x20 and x5 factors are built from two shift-by-2 steps. The result is
 * packed with unsigned saturation and its low word is written with
 * gsswlc1/gsswrc1, so r0 needs no alignment.
 *
 * Side effects: f0/f2 hold the result; f24, f26, f28, f30 are scratch and f28
 * is cleared before the pack. f8 is borrowed for constants and restored from
 * r2 at the end. r1 and r2 are general-purpose scratch registers.
 */
#define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
  "mov.d      "#f28", "#f8"                   \n\t" \
  "mov.d      "#f30", "#f10"                  \n\t" \
  "mov.d      "#f24", "#f4"                   \n\t" \
  "mov.d      "#f26", "#f6"                   \n\t" \
  "dmfc1      "#r2", "#f8"                    \n\t" \
  "dli        "#r1", 0x0010001000100010       \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "paddh      "#f0", "#f0", "#f8"             \n\t" \
  "paddh      "#f2", "#f2", "#f8"             \n\t" \
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
  "dli        "#r1", 0x2                      \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
  "psubh      "#f28", "#f28", "#f24"          \n\t" \
  "psubh      "#f30", "#f30", "#f26"          \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "dli        "#r1", 0x5                      \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "xor        "#f28", "#f28", "#f28"          \n\t" \
  "packushb   "#f0", "#f0", "#f2"             \n\t" \
  "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
  "gsswrc1    "#f0", 0x0("#r0")               \n\t" \
  "dmtc1      "#r2", "#f8"                    \n\t"
1708 
/*
 * FILTER_HV_W8: asm-template fragment applying a 6-tap filter to six inputs
 * held as register pairs of 16-bit lanes, then storing 8 result bytes at r0.
 *
 * Taps: t0 = f0/f2, t1 = f4/f6, t2 = f8/f10, t3 = f12/f14, t4 = f16/f18,
 * t5 = f20/f22. Per lane the macro computes
 *     (t0 + t5 + 20 * (t2 + t3) - 5 * (t1 + t4) + 16) >> 5
 * The arithmetic is instruction-for-instruction the same as FILTER_HV_W4;
 * only the store differs. The packed, unsigned-saturated doubleword is
 * written with gssdlc1/gssdrc1, so r0 needs no alignment.
 *
 * Side effects: f0/f2 hold the result; f24, f26, f28, f30 are scratch and f28
 * is cleared before the pack. f8 is borrowed for constants and restored from
 * r2 at the end. r1 and r2 are general-purpose scratch registers.
 */
#define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
  "mov.d      "#f28", "#f8"                   \n\t" \
  "mov.d      "#f30", "#f10"                  \n\t" \
  "mov.d      "#f24", "#f4"                   \n\t" \
  "mov.d      "#f26", "#f6"                   \n\t" \
  "dmfc1      "#r2", "#f8"                    \n\t" \
  "dli        "#r1", 0x0010001000100010       \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "paddh      "#f0", "#f0", "#f8"             \n\t" \
  "paddh      "#f2", "#f2", "#f8"             \n\t" \
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
  "dli        "#r1", 0x2                      \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
  "psubh      "#f28", "#f28", "#f24"          \n\t" \
  "psubh      "#f30", "#f30", "#f26"          \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "psllh      "#f28", "#f28", "#f8"           \n\t" \
  "psllh      "#f30", "#f30", "#f8"           \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "dli        "#r1", 0x5                      \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "xor        "#f28", "#f28", "#f28"          \n\t" \
  "packushb   "#f0", "#f0", "#f2"             \n\t" \
  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
  "dmtc1      "#r2", "#f8"                    \n\t"
1747 
/*
 * FILTER_VER_ALIGN: asm-template fragment that filters six inputs held as
 * register pairs of 16-bit lanes and stores 8 result bytes at (r0 + r1).
 *
 * With A = (f0/f2) + (f20/f22), B = (f4/f6) + (f16/f18) and
 * C = (f8/f10) + (f12/f14), each lane becomes
 *     (C + ((((A - B) >> 2) + C - B) >> 2) + K) >> 6
 * where K is the per-lane constant taken from general register r4.
 * NOTE(review): K is presumably the rounding term -- confirm at call sites.
 * The result is packed with unsigned saturation and written by the indexed
 * store gssdxc1 at 0x0(r0, r1).
 * NOTE(review): the macro name suggests that address must be 8-byte aligned.
 *
 * Side effects: f0/f2, f24/f26 and f28/f30 are overwritten (f28 holds the
 * packed result). f8 is borrowed for shift counts and K and restored from r3
 * at the end. r2 and r3 are general-purpose scratch registers.
 */
#define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                         f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
  "mov.d      "#f24", "#f4"                   \n\t" \
  "mov.d      "#f26", "#f6"                   \n\t" \
  "mov.d      "#f28", "#f8"                   \n\t" \
  "mov.d      "#f30", "#f10"                  \n\t" \
  "dli        "#r2", 0x2                      \n\t" \
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
  "dmfc1      "#r3", "#f8"                    \n\t" \
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
  "dmtc1      "#r2", "#f8"                    \n\t" \
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "dmtc1      "#r4", "#f8"                    \n\t" \
  "paddh      "#f28", "#f28", "#f0"           \n\t" \
  "paddh      "#f30", "#f30", "#f2"           \n\t" \
  "dli        "#r2", 0x6                      \n\t" \
  "paddh      "#f28", "#f28", "#f8"           \n\t" \
  "paddh      "#f30", "#f30", "#f8"           \n\t" \
  "dmtc1      "#r2", "#f8"                    \n\t" \
  "psrah      "#f28", "#f28", "#f8"           \n\t" \
  "psrah      "#f30", "#f30", "#f8"           \n\t" \
  "packushb   "#f28", "#f28", "#f30"          \n\t" \
  "gssdxc1    "#f28", 0x0("#r0", "#r1")       \n\t" \
  "dmtc1      "#r3", "#f8"                    \n\t"
1785 
/*
 * FILTER_VER_UNALIGN: asm-template fragment that filters six inputs held as
 * register pairs of 16-bit lanes and stores 8 result bytes at r0.
 *
 * With A = (f0/f2) + (f20/f22), B = (f4/f6) + (f16/f18) and
 * C = (f8/f10) + (f12/f14), each lane becomes
 *     (C + ((((A - B) >> 2) + C - B) >> 2) + K) >> 6
 * where K is the per-lane constant taken from general register r3.
 * NOTE(review): K is presumably the rounding term -- confirm at call sites.
 * The arithmetic matches FILTER_VER_ALIGN. The packed, unsigned-saturated
 * result is written with gssdlc1/gssdrc1, so r0 needs no alignment.
 *
 * Side effects: f0/f2, f24/f26 and f28/f30 are overwritten (f28 holds the
 * packed result). f8 is borrowed for shift counts and K and restored from r2
 * at the end. r1 and r2 are general-purpose scratch registers.
 */
#define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                           f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \
  "paddh      "#f0", "#f0", "#f20"            \n\t" \
  "paddh      "#f2", "#f2", "#f22"            \n\t" \
  "mov.d      "#f24", "#f4"                   \n\t" \
  "mov.d      "#f26", "#f6"                   \n\t" \
  "mov.d      "#f28", "#f8"                   \n\t" \
  "mov.d      "#f30", "#f10"                  \n\t" \
  "dli        "#r1", 0x2                      \n\t" \
  "paddh      "#f24", "#f24", "#f16"          \n\t" \
  "paddh      "#f26", "#f26", "#f18"          \n\t" \
  "dmfc1      "#r2", "#f8"                    \n\t" \
  "paddh      "#f28", "#f28", "#f12"          \n\t" \
  "paddh      "#f30", "#f30", "#f14"          \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "paddh      "#f0", "#f0", "#f28"            \n\t" \
  "paddh      "#f2", "#f2", "#f30"            \n\t" \
  "psubh      "#f0", "#f0", "#f24"            \n\t" \
  "psubh      "#f2", "#f2", "#f26"            \n\t" \
  "psrah      "#f0", "#f0", "#f8"             \n\t" \
  "psrah      "#f2", "#f2", "#f8"             \n\t" \
  "dmtc1      "#r3", "#f8"                    \n\t" \
  "paddh      "#f28", "#f28", "#f0"           \n\t" \
  "paddh      "#f30", "#f30", "#f2"           \n\t" \
  "dli        "#r1", 0x6                      \n\t" \
  "paddh      "#f28", "#f28", "#f8"           \n\t" \
  "paddh      "#f30", "#f30", "#f8"           \n\t" \
  "dmtc1      "#r1", "#f8"                    \n\t" \
  "psrah      "#f28", "#f28", "#f8"           \n\t" \
  "psrah      "#f30", "#f30", "#f8"           \n\t" \
  "packushb   "#f28", "#f28", "#f30"          \n\t" \
  "gssdlc1    "#f28", 0x7("#r0")              \n\t" \
  "gssdrc1    "#f28", 0x0("#r0")              \n\t" \
  "dmtc1      "#r2", "#f8"                    \n\t"
1824 
McHorVer20Width5_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1825 void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1826                           int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1827   BACKUP_REG;
1828   __asm__ volatile (
1829     ".set       arch=loongson3a                 \n\t"
1830     "xor        $f28, $f28, $f28                \n\t"
1831     PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
1832     "dli        $8, 0x2                         \n\t"
1833     "dli        $10, 0x0010001000100010         \n\t"
1834     "dli        $11, 0x5                        \n\t"
1835     "1:                                         \n\t"
1836     "xor        $f28, $f28, $f28                \n\t"
1837     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
1838     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
1839     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
1840     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
1841     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
1842     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
1843     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
1844     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
1845     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
1846     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
1847     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
1848     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
1849     "punpckhbh  $f2, $f0, $f28                  \n\t"
1850     "punpckhbh  $f6, $f4, $f28                  \n\t"
1851     "punpckhbh  $f10, $f8, $f28                 \n\t"
1852     "punpckhbh  $f14, $f12, $f28                \n\t"
1853     "punpckhbh  $f18, $f16, $f28                \n\t"
1854     "punpckhbh  $f22, $f20, $f28                \n\t"
1855     "punpcklbh  $f0, $f0, $f28                  \n\t"
1856     "punpcklbh  $f4, $f4, $f28                  \n\t"
1857     "punpcklbh  $f8, $f8, $f28                  \n\t"
1858     "punpcklbh  $f12, $f12, $f28                \n\t"
1859     "punpcklbh  $f16, $f16, $f28                \n\t"
1860     "punpcklbh  $f20, $f20, $f28                \n\t"
1861 
1862     "mov.d      $f28, $f8                       \n\t"
1863     "mov.d      $f30, $f10                      \n\t"
1864     "paddh      $f28, $f28, $f12                \n\t"
1865     "paddh      $f30, $f30, $f14                \n\t"
1866     "mov.d      $f24, $f16                      \n\t"
1867     "mov.d      $f26, $f18                      \n\t"
1868     "paddh      $f24, $f24, $f20                \n\t"
1869     "paddh      $f26, $f26, $f22                \n\t"
1870     "dmfc1      $9, $f12                        \n\t"
1871     "dmtc1      $8, $f12                        \n\t"
1872     "psllh      $f24, $f24, $f12                \n\t"
1873     "psllh      $f26, $f26, $f12                \n\t"
1874     "psubh      $f24, $f24, $f28                \n\t"
1875     "psubh      $f26, $f26, $f30                \n\t"
1876     "paddh      $f0, $f0, $f4                   \n\t"
1877     "paddh      $f2, $f2, $f6                   \n\t"
1878     "paddh      $f0, $f0, $f24                  \n\t"
1879     "paddh      $f2, $f2, $f26                  \n\t"
1880     "psllh      $f24, $f24, $f12                \n\t"
1881     "psllh      $f26, $f26, $f12                \n\t"
1882     "paddh      $f0, $f0, $f24                  \n\t"
1883     "paddh      $f2, $f2, $f26                  \n\t"
1884 
1885     "dmtc1      $10, $f12                       \n\t"
1886     "paddh      $f0, $f0, $f12                  \n\t"
1887     "paddh      $f2, $f2, $f12                  \n\t"
1888     "dmtc1      $11, $f12                       \n\t"
1889     "psrah      $f0, $f0, $f12                  \n\t"
1890     "psrah      $f2, $f2, $f12                  \n\t"
1891     "packushb   $f0, $f0, $f2                   \n\t"
1892 
1893     "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
1894     "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
1895 
1896     "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
1897     "xor        $f28, $f28, $f28                \n\t"
1898     "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
1899     "punpckhbh  $f2, $f0, $f28                  \n\t"
1900     "punpcklbh  $f0, $f0, $f28                  \n\t"
1901     "dmtc1      $9, $f12                        \n\t"
1902     "dmtc1      $8, $f24                        \n\t"
1903 
1904     "paddh      $f16, $f16, $f4                 \n\t"
1905     "paddh      $f18, $f18, $f6                 \n\t"
1906     "paddh      $f20, $f20, $f12                \n\t"
1907     "paddh      $f22, $f22, $f14                \n\t"
1908     "psllh      $f20, $f20, $f24                \n\t"
1909     "psllh      $f22, $f22, $f24                \n\t"
1910     "psubh      $f20, $f20, $f16                \n\t"
1911     "psubh      $f22, $f22, $f18                \n\t"
1912     "paddh      $f8, $f8, $f0                   \n\t"
1913     "paddh      $f10, $f10, $f2                 \n\t"
1914     "paddh      $f8, $f8, $f20                  \n\t"
1915     "paddh      $f10, $f10, $f22                \n\t"
1916     "psllh      $f20, $f20, $f24                \n\t"
1917     "psllh      $f22, $f22, $f24                \n\t"
1918     "paddh      $f8, $f8, $f20                  \n\t"
1919     "paddh      $f10, $f10, $f22                \n\t"
1920 
1921     "dmtc1      $10, $f24                       \n\t"
1922     "paddh      $f8, $f8, $f24                  \n\t"
1923     "paddh      $f10, $f10, $f24                \n\t"
1924     "dmtc1      $11, $f24                       \n\t"
1925     "psrah      $f8, $f8, $f24                  \n\t"
1926     "psrah      $f10, $f10, $f24                \n\t"
1927     "packushb   $f8, $f8, $f10                  \n\t"
1928     "gsswlc1    $f8, 0x4(%[pDst])               \n\t"
1929     "gsswrc1    $f8, 0x1(%[pDst])               \n\t"
1930 
1931     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
1932     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
1933     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
1934     "bnez       %[iHeight], 1b                  \n\t"
1935     : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
1936       [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
1937     : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
1938     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
1939       "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1940       "$f28", "$f30"
1941   );
1942   RECOVER_REG;
1943 }
1944 
// Horizontal 6-tap half-sample filter for rows that are 9 or 17 pixels wide.
//
// With s[] indexed relative to the caller's pSrc, every output byte is
//   pack_u8((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2] + s[x+3] + 16) >> 5)
// computed on 16-bit lanes and narrowed with packushb.
//
//   pSrc       - source row; the asm rewinds it by 2 bytes, so bytes pSrc[-2..] are read
//   iSrcStride - added to pSrc after every row
//   pDst       - destination row
//   iDstStride - added to pDst after every row
//   iWidth     - only compared against 9; every other value takes the 17-wide path
//   iHeight    - number of rows; both loops decrement-then-test, so the body always
//                runs at least once (NOTE(review): callers presumably pass iHeight >= 1)
//
// BACKUP_REG / RECOVER_REG are macros defined outside this block.
McHorVer20Width9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1945 void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1946                               int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1947   BACKUP_REG;
1948   __asm__ volatile (
1949     ".set       arch=loongson3a                 \n\t"
    // Rewind to the left-most tap and set up constants:
    //   $8 = 2 (shift count), $9 = 9 (width test), $10 = 16 in each halfword (rounding),
    //   $11 = 5 (final shift), $f28 = 0 (zero for byte->halfword widening).
1950     PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
1951     "xor        $f28, $f28, $f28                \n\t"
1952     "dli        $8, 0x2                         \n\t"
1953     "dli        $9, 0x9                         \n\t"
1954     "dli        $10, 0x0010001000100010         \n\t"
1955     "dli        $11, 0x5                        \n\t"
1956     "bne        %[iWidth], $9, 2f               \n\t"
    // ---- 9-wide path: one row per iteration ----
1957     "1:                                         \n\t"
1958     "xor        $f28, $f28, $f28                \n\t"
    // Six unaligned 8-byte loads (gsldlc1/gsldrc1 pairs) at byte offsets
    // 0, 5, 1, 4, 2, 3 -> taps a, f, b, e, c, d in $f0, $f4, $f8, $f12, $f16, $f20.
1959     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
1960     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
1961     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
1962     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
1963     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
1964     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
1965     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
1966     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
1967     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
1968     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
1969     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
1970     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
    // Widen bytes to halfwords against the zero in $f28: the high four bytes go to
    // the odd partner ($f2, $f6, ...), the low four stay in place.
1971     "punpckhbh  $f2, $f0, $f28                  \n\t"
1972     "punpckhbh  $f6, $f4, $f28                  \n\t"
1973     "punpckhbh  $f10, $f8, $f28                 \n\t"
1974     "punpckhbh  $f14, $f12, $f28                \n\t"
1975     "punpckhbh  $f18, $f16, $f28                \n\t"
1976     "punpckhbh  $f22, $f20, $f28                \n\t"
1977     "punpcklbh  $f0, $f0, $f28                  \n\t"
1978     "punpcklbh  $f4, $f4, $f28                  \n\t"
1979     "punpcklbh  $f8, $f8, $f28                  \n\t"
1980     "punpcklbh  $f12, $f12, $f28                \n\t"
1981     "punpcklbh  $f16, $f16, $f28                \n\t"
1982     "punpcklbh  $f20, $f20, $f28                \n\t"
1983 
    // $f28/$f30 = b + e, $f24/$f26 = c + d (copies, so the taps survive for the
    // second, one-pixel-shifted pass below).
1984     "mov.d      $f28, $f8                       \n\t"
1985     "mov.d      $f30, $f10                      \n\t"
1986     "paddh      $f28, $f28, $f12                \n\t"
1987     "paddh      $f30, $f30, $f14                \n\t"
1988     "mov.d      $f24, $f16                      \n\t"
1989     "mov.d      $f26, $f18                      \n\t"
1990     "paddh      $f24, $f24, $f20                \n\t"
1991     "paddh      $f26, $f26, $f22                \n\t"
    // $f12 is borrowed as the shift-count register, so the low half of tap e is
    // parked in $9 first (the width constant in $9 is no longer needed).
1992     "dmfc1      $9, $f12                        \n\t"
1993     "dmtc1      $8, $f12                        \n\t"
    // t = 4*(c+d) - (b+e)
1994     "psllh      $f24, $f24, $f12                \n\t"
1995     "psllh      $f26, $f26, $f12                \n\t"
1996     "psubh      $f24, $f24, $f28                \n\t"
1997     "psubh      $f26, $f26, $f30                \n\t"
    // sum = (a+f) + t + 4*t = a + f + 20*(c+d) - 5*(b+e)
1998     "paddh      $f0, $f0, $f4                   \n\t"
1999     "paddh      $f2, $f2, $f6                   \n\t"
2000     "paddh      $f0, $f0, $f24                  \n\t"
2001     "paddh      $f2, $f2, $f26                  \n\t"
2002     "psllh      $f24, $f24, $f12                \n\t"
2003     "psllh      $f26, $f26, $f12                \n\t"
2004     "paddh      $f0, $f0, $f24                  \n\t"
2005     "paddh      $f2, $f2, $f26                  \n\t"
2006 
    // (sum + 16) >> 5, then narrow the eight halfwords back to bytes.
2007     "dmtc1      $10, $f12                       \n\t"
2008     "paddh      $f0, $f0, $f12                  \n\t"
2009     "paddh      $f2, $f2, $f12                  \n\t"
2010     "dmtc1      $11, $f12                       \n\t"
2011     "psrah      $f0, $f0, $f12                  \n\t"
2012     "psrah      $f2, $f2, $f12                  \n\t"
2013     "packushb   $f0, $f0, $f2                   \n\t"
2014 
    // Only the low 32-bit word is stored: pDst[0..3].
2015     "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
2016     "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
2017 
    // Second pass: window shifted right by one pixel. Only the new right-most tap
    // (offset 6) has to be loaded; offsets 1..5 are still live in registers.
2018     "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
2019     "xor        $f28, $f28, $f28                \n\t"
2020     "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
2021     "punpckhbh  $f2, $f0, $f28                  \n\t"
2022     "punpcklbh  $f0, $f0, $f28                  \n\t"
    // Restore the parked low half of the offset-4 tap; shift count 2 now lives in $f24.
2023     "dmtc1      $9, $f12                        \n\t"
2024     "dmtc1      $8, $f24                        \n\t"
2025 
    // $f16/$f18 = off2 + off5, $f20/$f22 = 4*(off3 + off4) - (off2 + off5),
    // $f8/$f10  = (off1 + off6) + 5 * $f20/$f22
2026     "paddh      $f16, $f16, $f4                 \n\t"
2027     "paddh      $f18, $f18, $f6                 \n\t"
2028     "paddh      $f20, $f20, $f12                \n\t"
2029     "paddh      $f22, $f22, $f14                \n\t"
2030     "psllh      $f20, $f20, $f24                \n\t"
2031     "psllh      $f22, $f22, $f24                \n\t"
2032     "psubh      $f20, $f20, $f16                \n\t"
2033     "psubh      $f22, $f22, $f18                \n\t"
2034     "paddh      $f8, $f8, $f0                   \n\t"
2035     "paddh      $f10, $f10, $f2                 \n\t"
2036     "paddh      $f8, $f8, $f20                  \n\t"
2037     "paddh      $f10, $f10, $f22                \n\t"
2038     "psllh      $f20, $f20, $f24                \n\t"
2039     "psllh      $f22, $f22, $f24                \n\t"
2040     "paddh      $f8, $f8, $f20                  \n\t"
2041     "paddh      $f10, $f10, $f22                \n\t"
2042 
    // Round, shift, pack, and store eight bytes at pDst[1..8]; bytes 1..3 overlap
    // the earlier word store.
2043     "dmtc1      $10, $f24                       \n\t"
2044     "paddh      $f8, $f8, $f24                  \n\t"
2045     "paddh      $f10, $f10, $f24                \n\t"
2046     "dmtc1      $11, $f24                       \n\t"
2047     "psrah      $f8, $f8, $f24                  \n\t"
2048     "psrah      $f10, $f10, $f24                \n\t"
2049     "packushb   $f8, $f8, $f10                  \n\t"
2050     "gssdlc1    $f8, 0x8(%[pDst])               \n\t"
2051     "gssdrc1    $f8, 0x1(%[pDst])               \n\t"
2052 
    // Next row; leave through label 3 when all rows are done.
2053     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2054     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2055     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2056     "bnez       %[iHeight], 1b                  \n\t"
2057     "j          3f                              \n\t"
2058 
    // ---- 17-wide path (reached for any iWidth other than 9) ----
2059     "2:                                         \n\t"
2060     "xor        $f28, $f28, $f28                \n\t"
    // First group: taps at offsets 0, 5, 1, 4, 2, 3 as in the 9-wide path.
2061     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2062     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2063     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2064     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2065     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2066     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2067     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2068     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2069     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2070     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2071     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2072     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2073     "punpckhbh  $f2, $f0, $f28                  \n\t"
2074     "punpckhbh  $f6, $f4, $f28                  \n\t"
2075     "punpckhbh  $f10, $f8, $f28                 \n\t"
2076     "punpckhbh  $f14, $f12, $f28                \n\t"
2077     "punpckhbh  $f18, $f16, $f28                \n\t"
2078     "punpckhbh  $f22, $f20, $f28                \n\t"
2079     "punpcklbh  $f0, $f0, $f28                  \n\t"
2080     "punpcklbh  $f4, $f4, $f28                  \n\t"
2081     "punpcklbh  $f8, $f8, $f28                  \n\t"
2082     "punpcklbh  $f12, $f12, $f28                \n\t"
2083     "punpcklbh  $f16, $f16, $f28                \n\t"
2084     "punpcklbh  $f20, $f20, $f28                \n\t"
2085 
    // Same arithmetic, but done in place (the taps are reloaded for the next group)
    // with $f30 as the shift/rounding register, so $f28 stays zero.
2086     "dmtc1      $8, $f30                        \n\t"
2087     "paddh      $f8, $f8, $f12                  \n\t"
2088     "paddh      $f10, $f10, $f14                \n\t"
2089     "paddh      $f16, $f16, $f20                \n\t"
2090     "paddh      $f18, $f18, $f22                \n\t"
2091     "psllh      $f16, $f16, $f30                \n\t"
2092     "psllh      $f18, $f18, $f30                \n\t"
2093     "psubh      $f16, $f16, $f8                 \n\t"
2094     "psubh      $f18, $f18, $f10                \n\t"
2095     "paddh      $f0, $f0, $f4                   \n\t"
2096     "paddh      $f2, $f2, $f6                   \n\t"
2097     "paddh      $f0, $f0, $f16                  \n\t"
2098     "paddh      $f2, $f2, $f18                  \n\t"
2099     "psllh      $f16, $f16, $f30                \n\t"
2100     "psllh      $f18, $f18, $f30                \n\t"
2101     "paddh      $f0, $f0, $f16                  \n\t"
2102     "paddh      $f2, $f2, $f18                  \n\t"
2103 
    // Round, shift, pack; all eight bytes are stored: pDst[0..7].
2104     "dmtc1      $10, $f30                       \n\t"
2105     "paddh      $f0, $f0, $f30                  \n\t"
2106     "paddh      $f2, $f2, $f30                  \n\t"
2107     "dmtc1      $11, $f30                       \n\t"
2108     "psrah      $f0, $f0, $f30                  \n\t"
2109     "psrah      $f2, $f2, $f30                  \n\t"
2110     "packushb   $f0, $f0, $f2                   \n\t"
2111     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
2112     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
2113 
    // Second group: the same six taps, 8 bytes further right
    // (offsets 8, 13, 9, 12, 10, 11).
2114     "gsldlc1    $f0, 15(%[pSrc])                \n\t"
2115     "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
2116     "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
2117     "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
2118     "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
2119     "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
2120     "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
2121     "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
2122     "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
2123     "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
2124     "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
2125     "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
2126     "punpckhbh  $f2, $f0, $f28                  \n\t"
2127     "punpckhbh  $f6, $f4, $f28                  \n\t"
2128     "punpckhbh  $f10, $f8, $f28                 \n\t"
2129     "punpckhbh  $f14, $f12, $f28                \n\t"
2130     "punpckhbh  $f18, $f16, $f28                \n\t"
2131     "punpckhbh  $f22, $f20, $f28                \n\t"
2132     "punpcklbh  $f0, $f0, $f28                  \n\t"
2133     "punpcklbh  $f4, $f4, $f28                  \n\t"
2134     "punpcklbh  $f8, $f8, $f28                  \n\t"
2135     "punpcklbh  $f12, $f12, $f28                \n\t"
2136     "punpcklbh  $f16, $f16, $f28                \n\t"
2137     "punpcklbh  $f20, $f20, $f28                \n\t"
2138 
    // Tap-preserving form (copies into $f24..$f30), as in the 9-wide path; the low
    // half of the offset-12 tap is parked in $9 while $f12 carries the shift count.
2139     "mov.d      $f28, $f8                       \n\t"
2140     "mov.d      $f30, $f10                      \n\t"
2141     "paddh      $f28, $f28, $f12                \n\t"
2142     "paddh      $f30, $f30, $f14                \n\t"
2143     "mov.d      $f24, $f16                      \n\t"
2144     "mov.d      $f26, $f18                      \n\t"
2145     "paddh      $f24, $f24, $f20                \n\t"
2146     "paddh      $f26, $f26, $f22                \n\t"
2147     "dmfc1      $9, $f12                        \n\t"
2148     "dmtc1      $8, $f12                        \n\t"
2149     "psllh      $f24, $f24, $f12                \n\t"
2150     "psllh      $f26, $f26, $f12                \n\t"
2151     "psubh      $f24, $f24, $f28                \n\t"
2152     "psubh      $f26, $f26, $f30                \n\t"
2153     "paddh      $f0, $f0, $f4                   \n\t"
2154     "paddh      $f2, $f2, $f6                   \n\t"
2155     "paddh      $f0, $f0, $f24                  \n\t"
2156     "paddh      $f2, $f2, $f26                  \n\t"
2157     "psllh      $f24, $f24, $f12                \n\t"
2158     "psllh      $f26, $f26, $f12                \n\t"
2159     "paddh      $f0, $f0, $f24                  \n\t"
2160     "paddh      $f2, $f2, $f26                  \n\t"
2161 
    // Round, shift, pack; only the low word is stored: pDst[8..11].
2162     "dmtc1      $10, $f30                       \n\t"
2163     "paddh      $f0, $f0, $f30                  \n\t"
2164     "paddh      $f2, $f2, $f30                  \n\t"
2165     "dmtc1      $11, $f30                       \n\t"
2166     "psrah      $f0, $f0, $f30                  \n\t"
2167     "psrah      $f2, $f2, $f30                  \n\t"
2168     "packushb   $f0, $f0, $f2                   \n\t"
2169     "gsswlc1    $f0, 0xb(%[pDst])               \n\t"
2170     "gsswrc1    $f0, 0x8(%[pDst])               \n\t"
2171 
    // Third group: window shifted one more pixel; restore the parked tap, re-zero
    // $f28 and load the new right-most tap at offset 14.
    // NOTE(review): $f30 is loaded with 0x20 here but is not read again before the
    // loop top overwrites it; this looks like a leftover. Confirm before removing.
2172     "dmtc1      $9, $f12                        \n\t"
2173     "xor        $f28, $f28, $f28                \n\t"
2174     "dli        $9, 0x20                        \n\t"
2175     "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
2176     "dmtc1      $9, $f30                        \n\t"
2177     "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
2178     "punpckhbh  $f2, $f0, $f28                  \n\t"
2179     "punpcklbh  $f0, $f0, $f28                  \n\t"
2180     "dmtc1      $8, $f24                        \n\t"
2181 
    // Same shifted-window arithmetic as the second pass of the 9-wide path.
2182     "paddh      $f16, $f16, $f4                 \n\t"
2183     "paddh      $f18, $f18, $f6                 \n\t"
2184     "paddh      $f20, $f20, $f12                \n\t"
2185     "paddh      $f22, $f22, $f14                \n\t"
2186     "psllh      $f20, $f20, $f24                \n\t"
2187     "psllh      $f22, $f22, $f24                \n\t"
2188     "psubh      $f20, $f20, $f16                \n\t"
2189     "psubh      $f22, $f22, $f18                \n\t"
2190     "paddh      $f8, $f8, $f0                   \n\t"
2191     "paddh      $f10, $f10, $f2                 \n\t"
2192     "paddh      $f8, $f8, $f20                  \n\t"
2193     "paddh      $f10, $f10, $f22                \n\t"
2194     "psllh      $f20, $f20, $f24                \n\t"
2195     "psllh      $f22, $f22, $f24                \n\t"
2196     "paddh      $f8, $f8, $f20                  \n\t"
2197     "paddh      $f10, $f10, $f22                \n\t"
2198 
    // Round, shift, pack; eight bytes stored at pDst[9..16].
2199     "dmtc1      $10, $f24                       \n\t"
2200     "paddh      $f8, $f8, $f24                  \n\t"
2201     "paddh      $f10, $f10, $f24                \n\t"
2202     "dmtc1      $11, $f24                       \n\t"
2203     "psrah      $f8, $f8, $f24                  \n\t"
2204     "psrah      $f10, $f10, $f24                \n\t"
2205     "packushb   $f8, $f8, $f10                  \n\t"
2206     "gssdlc1    $f8, 0x10(%[pDst])              \n\t"
2207     "gssdrc1    $f8, 0x9(%[pDst])               \n\t"
2208 
    // Next row.
2209     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2210     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2211     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2212     "bnez       %[iHeight], 2b                  \n\t"
2213     "3:                                         \n\t"
    // pSrc, pDst, iWidth and iHeight are read-write operands because the asm
    // modifies its working copies; the strides are read-only inputs.
2214     : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2215       [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2216     : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
2217     : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
2218       "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
2219       "$f28", "$f30"
2220   );
2221   RECOVER_REG;
2222 }
2223 
// Horizontal half-sample filter, i.e. the (2, 0) position on the quarter-sample
// grid. Picks the MMI kernel by block width: 9 and 17 share one routine, every
// other width (expected to be 5) goes to the 5-wide routine.
static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
                                               uint8_t* pDst, int32_t iDstStride,
                                               int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 9:
  case 17:
    McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
    break;
  default: // iWidth == 5
    McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
    break;
  }
}
2233 
// Vertical filter kernel that walks the block in 4-pixel-wide columns.
//
// For each column it keeps six consecutive source rows (starting two rows above
// the current output row) in FP register pairs and hands them to FILTER_HV_W4,
// then slides the window down one row at a time.
//
//   pSrc       - source block; rows from pSrc - 2*iSrcStride onward are read
//   iSrcStride - distance between source rows
//   pDst       - destination block
//   iDstStride - distance between destination rows
//   iWidth     - shifted right by 2 to obtain the number of 4-pixel columns
//   iHeight    - rows per column; the first two rows are produced without a zero
//                test, so at least two rows are always written
//
// MMI_LOAD_8P, FILTER_HV_W4, BACKUP_REG and RECOVER_REG are macros defined outside
// this block. NOTE(review): presumably MMI_LOAD_8P loads one row and widens it
// into the two given registers using the third as zero, and FILTER_HV_W4 applies
// the 6-tap filter and stores 4 bytes at pDst - confirm against the macro
// definitions.
McHorVer02Height5_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2234 void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2235                            int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2236   BACKUP_REG;
2237   __asm__ volatile (
2238     ".set       arch=loongson3a                 \n\t"
    // Keep the original pSrc / pDst / iHeight in $12-$14 so each new column can
    // start from them again.
2239     "move       $12, %[pSrc]                    \n\t"
2240     "move       $13, %[pDst]                    \n\t"
2241     "move       $14, %[iHeight]                 \n\t"
2242 
    // iWidth = column count; $10 = 2 * iSrcStride; start two rows above pSrc.
2243     "dsrl       %[iWidth], %[iWidth], 0x2       \n\t"
2244     PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
2245     PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2246 
    // ---- per-column prologue: load rows -2..+3 and emit the first output row ----
2247     "1:                                         \n\t"
2248     "xor        $f28, $f28, $f28                \n\t"
2249     MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2250     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2251     MMI_LOAD_8P($f4, $f6, $f28, $8)
2252 
2253     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2254     MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2255     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2256     MMI_LOAD_8P($f12, $f14, $f28, $8)
2257     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2258     MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2259     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2260     MMI_LOAD_8P($f20, $f22, $f28, $8)
2261     FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2262                  $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2263     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
    // Fetch the next row into $f24/$f26 and slide the six-row window down by one,
    // so $f0..$f22 again hold the rows for the next output line.
2264     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2265     MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2266     "mov.d      $f0, $f4                        \n\t"
2267     "mov.d      $f2, $f6                        \n\t"
2268     "mov.d      $f4, $f8                        \n\t"
2269     "mov.d      $f6, $f10                       \n\t"
2270     "mov.d      $f8, $f12                       \n\t"
2271     "mov.d      $f10, $f14                      \n\t"
2272     "mov.d      $f12, $f16                      \n\t"
2273     "mov.d      $f14, $f18                      \n\t"
2274     "mov.d      $f16, $f20                      \n\t"
2275     "mov.d      $f18, $f22                      \n\t"
2276     "mov.d      $f20, $f24                      \n\t"
2277     "mov.d      $f22, $f26                      \n\t"
2278 
    // pSrc is left one row above the newest loaded row; the unrolled loop below
    // relies on that when it alternates "pSrc += 2*stride" and "$8 = pSrc + stride".
2279     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2280     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2281 
    // ---- main loop, unrolled 8x: instead of moving data, each step rotates which
    // register pairs play the six taps / scratch roles. Every step emits one row,
    // decrements iHeight and leaves via label 3 when it reaches zero. ----
2282     "2:                                         \n\t"
2283     FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2284                  $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2285     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2286     "beqz       %[iHeight], 3f                  \n\t"
2287 
2288     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2289     MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2290     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2291     FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2292                  $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2293     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2294     "beqz       %[iHeight], 3f                  \n\t"
2295 
2296     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2297     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2298     MMI_LOAD_8P($f28, $f30, $f0, $8)
2299     FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2300                  $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2301     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2302     "beqz       %[iHeight], 3f                  \n\t"
2303 
2304     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2305     MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2306     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2307     FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2308                  $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2309     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2310     "beqz       %[iHeight], 3f                  \n\t"
2311 
2312     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2313     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2314     MMI_LOAD_8P($f4, $f6, $f8, $8)
2315     FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
2316                  $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2317     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2318     "beqz       %[iHeight], 3f                  \n\t"
2319 
2320     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2321     MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2322     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2323     FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
2324                  $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2325     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2326     "beqz       %[iHeight], 3f                  \n\t"
2327 
2328     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2329     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2330     MMI_LOAD_8P($f12, $f14, $f16, $8)
2331     FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
2332                  $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2333     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2334     "beqz       %[iHeight], 3f                  \n\t"
2335 
2336     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2337     MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2338     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2339     FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2340                  $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2341     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2342     "beqz       %[iHeight], 3f                  \n\t"
2343 
    // Load the row that completes the rotation; register roles are back to those
    // expected at label 2.
2344     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2345     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2346     MMI_LOAD_8P($f20, $f22, $f24, $8)
2347     "j          2b                              \n\t"
2348 
    // ---- column epilogue: finish, or rewind to the saved pointers and step
    // 4 pixels to the right. ----
    // NOTE(review): $12/$13 are never advanced, so every column after the first
    // restarts at the saved pointers + 4; a third column would repeat the second.
    // Presumably callers never pass more than two columns' worth of width - confirm.
2349     "3:                                         \n\t"
2350     PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2351     "beqz       %[iWidth], 4f                   \n\t"
2352     "move       %[pSrc], $12                    \n\t"
2353     "move       %[pDst], $13                    \n\t"
2354     "move       %[iHeight], $14                 \n\t"
2355     PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2356     PTR_ADDIU  "%[pSrc], %[pSrc], 0x4           \n\t"
2357     PTR_ADDIU  "%[pDst], %[pDst], 0x4           \n\t"
2358     "j          1b                              \n\t"
2359     "4:                                         \n\t"
2360     : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2361       [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2362     : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
2363     : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2364       "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2365       "$f24", "$f26", "$f28", "$f30"
2366   );
2367   RECOVER_REG;
2368 }
2369 
// Vertical filter kernel that walks the block in 8-pixel-wide columns.
//
// Same structure as the 4-pixel-column variant: six consecutive source rows
// (starting two rows above the current output row) are held in FP register pairs
// and passed to FILTER_HV_W8, and the window slides down one row per step.
//
//   pSrc       - source block; rows from pSrc - 2*iSrcStride onward are read
//   iSrcStride - distance between source rows
//   pDst       - destination block
//   iDstStride - distance between destination rows
//   iWidth     - shifted right by 3 to obtain the number of 8-pixel columns
//   iHeight    - rows per column; the first two rows are produced without a zero
//                test, so at least two rows are always written
//
// MMI_LOAD_8P, FILTER_HV_W8, BACKUP_REG and RECOVER_REG are macros defined outside
// this block. NOTE(review): presumably MMI_LOAD_8P loads one row and widens it
// into the two given registers using the third as zero, and FILTER_HV_W8 applies
// the 6-tap filter and stores 8 bytes at pDst - confirm against the macro
// definitions.
McHorVer02Height9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2370 void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2371                                int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2372   BACKUP_REG;
2373   __asm__ volatile (
2374     ".set       arch=loongson3a                 \n\t"
    // Keep the original pSrc / pDst / iHeight in $12-$14 for the next column.
2375     "move       $12, %[pSrc]                    \n\t"
2376     "move       $13, %[pDst]                    \n\t"
2377     "move       $14, %[iHeight]                 \n\t"
2378 
    // iWidth = column count; $10 = 2 * iSrcStride; start two rows above pSrc.
2379     "dsrl       %[iWidth], %[iWidth], 0x3       \n\t"
2380     PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
2381     PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2382 
    // ---- per-column prologue ----
    // $f28 = 0 and $f30 = 0x20 are set up before the first filter call.
    // NOTE(review): presumably both are consumed by FILTER_HV_W8 (they are its
    // 15th/16th arguments in the first call) - confirm against the macro.
2383     "1:                                         \n\t"
2384     "dli        $8, 0x20                        \n\t"
2385     "xor        $f28, $f28, $f28                \n\t"
2386     "dmtc1      $8, $f30                        \n\t"
2387 
    // Load rows -2..+3 of this column and emit the first output row.
2388     MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2389     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2390     MMI_LOAD_8P($f4, $f6, $f28, $8)
2391     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2392     MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2393     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2394     MMI_LOAD_8P($f12, $f14, $f28, $8)
2395     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2396     MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2397     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2398     MMI_LOAD_8P($f20, $f22, $f28, $8)
2399     FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2400                  $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2401     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
    // Fetch the next row into $f24/$f26 and slide the six-row window down by one.
2402     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2403     MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2404     "mov.d      $f0, $f4                        \n\t"
2405     "mov.d      $f2, $f6                        \n\t"
2406     "mov.d      $f4, $f8                        \n\t"
2407     "mov.d      $f6, $f10                       \n\t"
2408     "mov.d      $f8, $f12                       \n\t"
2409     "mov.d      $f10, $f14                      \n\t"
2410     "mov.d      $f12, $f16                      \n\t"
2411     "mov.d      $f14, $f18                      \n\t"
2412     "mov.d      $f16, $f20                      \n\t"
2413     "mov.d      $f18, $f22                      \n\t"
2414     "mov.d      $f20, $f24                      \n\t"
2415     "mov.d      $f22, $f26                      \n\t"
    // pSrc is left one row above the newest loaded row for the unrolled loop.
2416     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2417     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2418 
    // ---- main loop, unrolled 8x with rotating register roles; every step emits
    // one row, decrements iHeight and leaves via label 3 at zero. ----
2419     "2:                                         \n\t"
2420     FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2421                  $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
    // NOTE(review): $9 is copied back into $f8 only after this first call of the
    // loop; presumably FILTER_HV_W8 parks a live tap in its last scratch GPR and
    // this restores it - verify against the macro definition.
2422     "dmtc1      $9, $f8                         \n\t"
2423     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2424     "beqz       %[iHeight], 3f                  \n\t"
2425 
2426     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2427     MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2428     PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2429     FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2430                  $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2431     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2432     "beqz       %[iHeight], 3f                  \n\t"
2433 
2434     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2435     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2436     MMI_LOAD_8P($f28, $f30, $f0, $8)
2437     FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2438                  $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2439     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2440     "beqz       %[iHeight], 3f                  \n\t"
2441 
2442     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2443     MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2444     PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2445     FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2446                  $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2447     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2448     "beqz       %[iHeight], 3f                  \n\t"
2449 
2450     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2451     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2452     MMI_LOAD_8P($f4, $f6, $f8, $8)
2453     FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2454                  $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2455     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2456     "beqz       %[iHeight], 3f                  \n\t"
2457 
2458     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2459     MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2460     PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2461     FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2462                  $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2463     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2464     "beqz       %[iHeight], 3f                  \n\t"
2465 
2466     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2467     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2468     MMI_LOAD_8P($f12, $f14, $f16, $8)
2469     FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2470                  $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2471     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2472     "beqz       %[iHeight], 3f                  \n\t"
2473 
2474     PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
2475     MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2476     PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
2477     FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2478                  $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2479     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2480     "beqz       %[iHeight], 3f                  \n\t"
2481 
    // Load the row that completes the rotation; register roles are back to those
    // expected at label 2.
2482     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2483     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
2484     MMI_LOAD_8P($f20, $f22, $f24, $8)
2485     "j          2b                              \n\t"
2486 
    // ---- column epilogue: finish, or rewind to the saved pointers and step
    // 8 pixels to the right. ----
    // NOTE(review): $12/$13 are never advanced, so every column after the first
    // restarts at the saved pointers + 8; a third column would repeat the second.
    // Presumably callers never pass more than two columns' worth of width - confirm.
2487     "3:                                         \n\t"
2488     PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2489     "beqz       %[iWidth], 4f                   \n\t"
2490 
2491     "move       %[pSrc], $12                    \n\t"
2492     "move       %[pDst], $13                    \n\t"
2493     "move       %[iHeight], $14                 \n\t"
2494     PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
2495     PTR_ADDIU  "%[pSrc], %[pSrc], 0x8           \n\t"
2496     PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
2497     "j          1b                              \n\t"
2498     "4:                                         \n\t"
2499     : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2500       [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2501     : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
2502     : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2503       "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2504       "$f24", "$f26", "$f28", "$f30"
2505   );
2506   RECOVER_REG;
2507 }
2508 
// Vertical half-sample filter, i.e. the (0, 2) position on the quarter-sample
// grid. Picks the MMI kernel by block width: 8 and 16 use the 8-pixel-column
// routine, every other width falls through to the 4-pixel-column routine.
static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
                                                uint8_t* pDst, int32_t iDstStride,
                                                int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 8:
  case 16:
    McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
    break;
  default:
    McHorVer02Height5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
    break;
  }
}
2518 
McHorVer22HorFirst_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pTap,int32_t iTapStride,int32_t iWidth,int32_t iHeight)2519 static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
2520                                           uint8_t * pTap, int32_t iTapStride,
2521                                           int32_t iWidth, int32_t iHeight) {
2522   BACKUP_REG;
2523   __asm__ volatile (
2524     ".set       arch=loongson3a                 \n\t"
2525     "dli        $8, 0x9                         \n\t"
2526     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2527     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2528     "bne        %[iWidth], $8, 2f               \n\t"
2529 
2530     "1:                                         \n\t"
2531     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2532     "xor        $f28, $f28, $f28                \n\t"
2533     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2534     "punpckhbh  $f2, $f0, $f28                  \n\t"
2535     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2536     "punpcklbh  $f0, $f0, $f28                  \n\t"
2537     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2538     "punpckhbh  $f6, $f4, $f28                  \n\t"
2539     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2540     "punpcklbh  $f4, $f4, $f28                  \n\t"
2541     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2542     "punpckhbh  $f10, $f8, $f28                 \n\t"
2543     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2544     "punpcklbh  $f8, $f8, $f28                  \n\t"
2545     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2546     "punpckhbh  $f14, $f12, $f28                \n\t"
2547     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2548     "punpcklbh  $f12, $f12, $f28                \n\t"
2549     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2550     "punpckhbh  $f18, $f16, $f28                \n\t"
2551     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2552     "punpcklbh  $f16, $f16, $f28                \n\t"
2553     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2554     "punpckhbh  $f22, $f20, $f28                \n\t"
2555     "punpcklbh  $f20, $f20, $f28                \n\t"
2556 
2557     "mov.d      $f28, $f8                       \n\t"
2558     "mov.d      $f30, $f10                      \n\t"
2559     "paddh      $f28, $f28, $f12                \n\t"
2560     "paddh      $f30, $f30, $f14                \n\t"
2561     "mov.d      $f24, $f16                      \n\t"
2562     "mov.d      $f26, $f18                      \n\t"
2563     "paddh      $f24, $f24, $f20                \n\t"
2564     "paddh      $f26, $f26, $f22                \n\t"
2565     "dli        $8, 0x2                         \n\t"
2566     "dmfc1      $9, $f12                        \n\t"
2567     "dmtc1      $8, $f12                        \n\t"
2568     "psllh      $f24, $f24, $f12                \n\t"
2569     "psllh      $f26, $f26, $f12                \n\t"
2570     "psubh      $f24, $f24, $f28                \n\t"
2571     "psubh      $f26, $f26, $f30                \n\t"
2572     "paddh      $f0, $f0, $f4                   \n\t"
2573     "paddh      $f2, $f2, $f6                   \n\t"
2574     "paddh      $f0, $f0, $f24                  \n\t"
2575     "paddh      $f2, $f2, $f26                  \n\t"
2576     "psllh      $f24, $f24, $f12                \n\t"
2577     "psllh      $f26, $f26, $f12                \n\t"
2578     "paddh      $f0, $f0, $f24                  \n\t"
2579     "paddh      $f2, $f2, $f26                  \n\t"
2580     "gsswlc1    $f0, 0x3(%[pTap])               \n\t"
2581     "gsswrc1    $f0, 0x0(%[pTap])               \n\t"
2582 
2583     "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
2584     "xor        $f28, $f28, $f28                \n\t"
2585     "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
2586     "punpckhbh  $f2, $f0, $f28                  \n\t"
2587     "punpcklbh  $f0, $f0, $f28                  \n\t"
2588     "dli        $8, 0x2                         \n\t"
2589     "dmtc1      $9, $f12                        \n\t"
2590     "dmtc1      $8, $f24                        \n\t"
2591 
2592     "paddh      $f16, $f16, $f4                 \n\t"
2593     "paddh      $f18, $f18, $f6                 \n\t"
2594     "paddh      $f20, $f20, $f12                \n\t"
2595     "paddh      $f22, $f22, $f14                \n\t"
2596     "psllh      $f20, $f20, $f24                \n\t"
2597     "psllh      $f22, $f22, $f24                \n\t"
2598     "psubh      $f20, $f20, $f16                \n\t"
2599     "psubh      $f22, $f22, $f18                \n\t"
2600     "paddh      $f8, $f8, $f0                   \n\t"
2601     "paddh      $f10, $f10, $f2                 \n\t"
2602     "paddh      $f8, $f8, $f20                  \n\t"
2603     "paddh      $f10, $f10, $f22                \n\t"
2604     "psllh      $f20, $f20, $f24                \n\t"
2605     "psllh      $f22, $f22, $f24                \n\t"
2606     "paddh      $f8, $f8, $f20                  \n\t"
2607     "paddh      $f10, $f10, $f22                \n\t"
2608     "gssdlc1    $f8, 0x9(%[pTap])               \n\t"
2609     "gssdlc1    $f10, 0x11(%[pTap])             \n\t"
2610     "gssdrc1    $f8, 0x2(%[pTap])               \n\t"
2611     "gssdrc1    $f10, 0xa(%[pTap])              \n\t"
2612 
2613     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2614     PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2615     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2616     "bnez       %[iHeight], 1b                  \n\t"
2617     "j          3f                              \n\t"
2618 
2619     "2:                                         \n\t"
2620     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
2621     "xor        $f28, $f28, $f28                \n\t"
2622     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
2623     "punpckhbh  $f2, $f0, $f28                  \n\t"
2624     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
2625     "punpcklbh  $f0, $f0, $f28                  \n\t"
2626     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
2627     "punpckhbh  $f6, $f4, $f28                  \n\t"
2628     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
2629     "punpcklbh  $f4, $f4, $f28                  \n\t"
2630     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
2631     "punpckhbh  $f10, $f8, $f28                 \n\t"
2632     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
2633     "punpcklbh  $f8, $f8, $f28                  \n\t"
2634     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
2635     "punpckhbh  $f14, $f12, $f28                \n\t"
2636     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
2637     "punpcklbh  $f12, $f12, $f28                \n\t"
2638     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
2639     "punpckhbh  $f18, $f16, $f28                \n\t"
2640     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
2641     "punpcklbh  $f16, $f16, $f28                \n\t"
2642     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
2643     "punpckhbh  $f22, $f20, $f28                \n\t"
2644     "dli        $8, 0x2                         \n\t"
2645     "punpcklbh  $f20, $f20, $f28                \n\t"
2646 
2647     "dmtc1      $8, $f30                        \n\t"
2648     "paddh      $f8, $f8, $f12                  \n\t"
2649     "paddh      $f10, $f10, $f14                \n\t"
2650     "paddh      $f16, $f16, $f20                \n\t"
2651     "paddh      $f18, $f18, $f22                \n\t"
2652     "psllh      $f16, $f16, $f30                \n\t"
2653     "psllh      $f18, $f18, $f30                \n\t"
2654     "psubh      $f16, $f16, $f8                 \n\t"
2655     "psubh      $f18, $f18, $f10                \n\t"
2656     "paddh      $f0, $f0, $f4                   \n\t"
2657     "paddh      $f2, $f2, $f6                   \n\t"
2658     "paddh      $f0, $f0, $f16                  \n\t"
2659     "paddh      $f2, $f2, $f18                  \n\t"
2660     "psllh      $f16, $f16, $f30                \n\t"
2661     "psllh      $f18, $f18, $f30                \n\t"
2662     "paddh      $f0, $f0, $f16                  \n\t"
2663     "paddh      $f2, $f2, $f18                  \n\t"
2664     "gssqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2665 
2666     "gsldlc1    $f0, 15(%[pSrc])                \n\t"
2667     "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
2668     "punpckhbh  $f2, $f0, $f28                  \n\t"
2669     "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
2670     "punpcklbh  $f0, $f0, $f28                  \n\t"
2671     "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
2672     "punpckhbh  $f6, $f4, $f28                  \n\t"
2673     "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
2674     "punpcklbh  $f4, $f4, $f28                  \n\t"
2675     "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
2676     "punpckhbh  $f10, $f8, $f28                 \n\t"
2677     "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
2678     "punpcklbh  $f8, $f8, $f28                  \n\t"
2679     "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
2680     "punpckhbh  $f14, $f12, $f28                \n\t"
2681     "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
2682     "punpcklbh  $f12, $f12, $f28                \n\t"
2683     "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
2684     "punpckhbh  $f18, $f16, $f28                \n\t"
2685     "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
2686     "punpcklbh  $f16, $f16, $f28                \n\t"
2687     "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
2688     "punpckhbh  $f22, $f20, $f28                \n\t"
2689     "punpcklbh  $f20, $f20, $f28                \n\t"
2690 
2691     "mov.d      $f28, $f8                       \n\t"
2692     "mov.d      $f30, $f10                      \n\t"
2693     "paddh      $f28, $f28, $f12                \n\t"
2694     "paddh      $f30, $f30, $f14                \n\t"
2695     "mov.d      $f24, $f16                      \n\t"
2696     "mov.d      $f26, $f18                      \n\t"
2697     "dli        $8, 0x2                         \n\t"
2698     "paddh      $f24, $f24, $f20                \n\t"
2699     "paddh      $f26, $f26, $f22                \n\t"
2700     "dmfc1      $9, $f12                        \n\t"
2701     "dmtc1      $8, $f12                        \n\t"
2702     "psllh      $f24, $f24, $f12                \n\t"
2703     "psllh      $f26, $f26, $f12                \n\t"
2704     "psubh      $f24, $f24, $f28                \n\t"
2705     "psubh      $f26, $f26, $f30                \n\t"
2706     "paddh      $f0, $f0, $f4                   \n\t"
2707     "paddh      $f2, $f2, $f6                   \n\t"
2708     "paddh      $f0, $f0, $f24                  \n\t"
2709     "paddh      $f2, $f2, $f26                  \n\t"
2710     "psllh      $f24, $f24, $f12                \n\t"
2711     "psllh      $f26, $f26, $f12                \n\t"
2712     "paddh      $f0, $f0, $f24                  \n\t"
2713     "paddh      $f2, $f2, $f26                  \n\t"
2714     "gsswlc1    $f0, 0x13(%[pTap])              \n\t"
2715     "gsswrc1    $f0, 0x10(%[pTap])              \n\t"
2716 
2717     "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
2718     "xor        $f28, $f28, $f28                \n\t"
2719     "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
2720     "punpckhbh  $f2, $f0, $f28                  \n\t"
2721     "punpcklbh  $f0, $f0, $f28                  \n\t"
2722     "dli        $8, 0x2                         \n\t"
2723     "dmtc1      $9, $f12                        \n\t"
2724     "dmtc1      $8, $f24                        \n\t"
2725 
2726     "paddh      $f16, $f16, $f4                 \n\t"
2727     "paddh      $f18, $f18, $f6                 \n\t"
2728     "paddh      $f20, $f20, $f12                \n\t"
2729     "paddh      $f22, $f22, $f14                \n\t"
2730     "psllh      $f20, $f20, $f24                \n\t"
2731     "psllh      $f22, $f22, $f24                \n\t"
2732     "psubh      $f20, $f20, $f16                \n\t"
2733     "psubh      $f22, $f22, $f18                \n\t"
2734     "paddh      $f8, $f8, $f0                   \n\t"
2735     "paddh      $f10, $f10, $f2                 \n\t"
2736     "paddh      $f8, $f8, $f20                  \n\t"
2737     "paddh      $f10, $f10, $f22                \n\t"
2738     "psllh      $f20, $f20, $f24                \n\t"
2739     "psllh      $f22, $f22, $f24                \n\t"
2740     "paddh      $f8, $f8, $f20                  \n\t"
2741     "paddh      $f10, $f10, $f22                \n\t"
2742     "gssdlc1    $f8, 0x19(%[pTap])              \n\t"
2743     "gssdlc1    $f10, 0x21(%[pTap])             \n\t"
2744     "gssdrc1    $f8, 0x12(%[pTap])              \n\t"
2745     "gssdrc1    $f10, 0x1a(%[pTap])             \n\t"
2746 
2747     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2748     PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2749     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2750     "bnez       %[iHeight], 2b                  \n\t"
2751     "3:                                         \n\t"
2752     : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth),
2753       [iHeight]"+&r"(iHeight)
2754     : [iSrcStride]"r"(iSrcStride),  [iTapStride]"r"(iTapStride)
2755     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
2756       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2757   );
2758   RECOVER_REG;
2759 }
2760 
McHorVer22Width8VerLastAlign_mmi(const uint8_t * pTap,int32_t iTapStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2761 static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap,
2762                    int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2763                    int32_t iWidth, int32_t iHeight) {
2764   BACKUP_REG;
2765   __asm__ volatile (
2766     ".set       arch=loongson3a                 \n\t"
2767     "move       $10, %[pTap]                    \n\t"
2768     "move       $11, %[pDst]                    \n\t"
2769     "move       $12, %[iHeight]                 \n\t"
2770     "dsrl       %[iWidth], 0x3                  \n\t"
2771     PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
2772     PTR_ADDU   "$14, %[iDstStride], %[iDstStride] \n\t"
2773     "dli        $15, 0x0020002000200020         \n\t"
2774 
2775     "4:                                         \n\t"
2776     "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2777     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2778     "gslqc1     $f6, $f4, 0x0($8)               \n\t"
2779     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2780     "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
2781     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2782     "gslqc1     $f14, $f12, 0x0($8)             \n\t"
2783     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2784     "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
2785     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2786     "gslqc1     $f22, $f20, 0x0($8)             \n\t"
2787 
2788     FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2789                      $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2790 
2791     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2792     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2793     "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
2794     "mov.d      $f0, $f4                        \n\t"
2795     "mov.d      $f2, $f6                        \n\t"
2796     "mov.d      $f4, $f8                        \n\t"
2797     "mov.d      $f6, $f10                       \n\t"
2798     "mov.d      $f8, $f12                       \n\t"
2799     "mov.d      $f10, $f14                      \n\t"
2800     "mov.d      $f12, $f16                      \n\t"
2801     "mov.d      $f14, $f18                      \n\t"
2802     "mov.d      $f16, $f20                      \n\t"
2803     "mov.d      $f18, $f22                      \n\t"
2804     "mov.d      $f20, $f24                      \n\t"
2805     "mov.d      $f22, $f26                      \n\t"
2806     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2807     PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2808 
2809     "5:                                         \n\t"
2810     FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2811                      $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2812     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2813     "beqz       %[iHeight], 6f                  \n\t"
2814     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2815     "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
2816 
2817     FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2818                      $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15)
2819     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2820     "beqz       %[iHeight], 6f                  \n\t"
2821     PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2822     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2823     "gslqc1     $f30, $f28, 0x0($8)             \n\t"
2824 
2825     FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2826                      $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15)
2827     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2828     "beqz       %[iHeight], 6f                  \n\t"
2829     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2830     "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
2831 
2832     FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2833                      $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15)
2834     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2835     "beqz       %[iHeight], 6f                  \n\t"
2836     PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2837     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2838     "gslqc1     $f6, $f4, 0x0($8)               \n\t"
2839 
2840     FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2841                      $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15)
2842     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2843     "beqz       %[iHeight], 6f                  \n\t"
2844     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2845     "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
2846 
2847     FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2848                      $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15)
2849     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2850     "beqz       %[iHeight], 6f                  \n\t"
2851     PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2852     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2853     "gslqc1     $f14, $f12, 0x0($8)             \n\t"
2854 
2855     FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2856                      $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15)
2857     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2858     "beqz       %[iHeight], 6f                  \n\t"
2859     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2860     "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
2861 
2862     FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2863                      $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15)
2864     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2865     "beqz       %[iHeight], 6f                  \n\t"
2866     PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
2867     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2868     "gslqc1     $f22, $f20, 0x0($8)             \n\t"
2869     "j          5b                              \n\t"
2870 
2871     "6:                                         \n\t"
2872     PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
2873     "beqz       %[iWidth], 7f                   \n\t"
2874     "move       %[pTap], $10                    \n\t"
2875     "move       %[pDst], $11                    \n\t"
2876     "move       %[iHeight], $12                 \n\t"
2877     PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
2878     PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
2879     "j          4b                              \n\t"
2880     "7:                                         \n\t"
2881     : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
2882       [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2883     : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
2884     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
2885       "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18",
2886       "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2887   );
2888   RECOVER_REG;
2889 }
2890 
McHorVer22Width8VerLastUnAlign_mmi(const uint8_t * pTap,int32_t iTapStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2891 static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap,
2892                    int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2893                    int32_t iWidth, int32_t iHeight) {
2894   BACKUP_REG;
2895   __asm__ volatile (
2896     ".set       arch=loongson3a                 \n\t"
2897     "move       $10, %[pTap]                    \n\t"
2898     "move       $11, %[pDst]                    \n\t"
2899     "move       $12, %[iHeight]                 \n\t"
2900     "dsrl       %[iWidth], 0x3                  \n\t"
2901     PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
2902     "dli        $14, 0x0020002000200020         \n\t"
2903 
2904     "4:                                         \n\t"
2905     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2906     "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
2907     "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
2908     "gsldlc1    $f4, 0x7($8)                    \n\t"
2909     "gsldlc1    $f6, 0xF($8)                    \n\t"
2910     "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
2911     "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
2912     "gsldrc1    $f4, 0x0($8)                    \n\t"
2913     "gsldrc1    $f6, 0x8($8)                    \n\t"
2914     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2915     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2916     "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
2917     "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
2918     "gsldlc1    $f12, 0x7($8)                   \n\t"
2919     "gsldlc1    $f14, 0xF($8)                   \n\t"
2920     "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
2921     "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
2922     "gsldrc1    $f12, 0x0($8)                   \n\t"
2923     "gsldrc1    $f14, 0x8($8)                   \n\t"
2924     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2925     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2926     "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
2927     "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
2928     "gsldlc1    $f20, 0x7($8)                   \n\t"
2929     "gsldlc1    $f22, 0xF($8)                   \n\t"
2930     "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
2931     "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
2932     "gsldrc1    $f20, 0x0($8)                   \n\t"
2933     "gsldrc1    $f22, 0x8($8)                   \n\t"
2934 
2935     FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2936                        $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
2937     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2938     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2939     "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
2940     "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
2941     "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
2942     "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
2943     "mov.d      $f0, $f4                        \n\t"
2944     "mov.d      $f2, $f6                        \n\t"
2945     "mov.d      $f4, $f8                        \n\t"
2946     "mov.d      $f6, $f10                       \n\t"
2947     "mov.d      $f8, $f12                       \n\t"
2948     "mov.d      $f10, $f14                      \n\t"
2949     "mov.d      $f12, $f16                      \n\t"
2950     "mov.d      $f14, $f18                      \n\t"
2951     "mov.d      $f16, $f20                      \n\t"
2952     "mov.d      $f18, $f22                      \n\t"
2953     "mov.d      $f20, $f24                      \n\t"
2954     "mov.d      $f22, $f26                      \n\t"
2955     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2956     PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
2957 
2958     "5:                                         \n\t"
2959     FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2960                        $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
2961 
2962     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2963     "beqz       %[iHeight], 6f                  \n\t"
2964     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2965     "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
2966     "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
2967     "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
2968     "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
2969     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2970 
2971     FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22,
2972                        $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14)
2973     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2974     "beqz       %[iHeight], 6f                  \n\t"
2975     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2976     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2977     "gsldlc1    $f28, 0x7($8)                   \n\t"
2978     "gsldlc1    $f30, 0xF($8)                   \n\t"
2979     "gsldrc1    $f28, 0x0($8)                   \n\t"
2980     "gsldrc1    $f30, 0x8($8)                   \n\t"
2981 
2982     FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
2983                        $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14)
2984     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2985     "beqz       %[iHeight], 6f                  \n\t"
2986     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
2987     "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
2988     "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
2989     "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
2990     "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
2991     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2992 
2993     FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2994                        $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14)
2995     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
2996     "beqz       %[iHeight], 6f                  \n\t"
2997     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
2998     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
2999     "gsldlc1    $f4, 0x7($8)                    \n\t"
3000     "gsldlc1    $f6, 0xF($8)                    \n\t"
3001     "gsldrc1    $f4, 0x0($8)                    \n\t"
3002     "gsldrc1    $f6, 0x8($8)                    \n\t"
3003 
3004     FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2,
3005                        $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14)
3006     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3007     "beqz       %[iHeight], 6f                  \n\t"
3008     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
3009     "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
3010     "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
3011     "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
3012     "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
3013     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3014 
3015     FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
3016                        $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14)
3017     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3018     "beqz       %[iHeight], 6f                  \n\t"
3019     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3020     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
3021     "gsldlc1    $f12, 0x7($8)                   \n\t"
3022     "gsldlc1    $f14, 0xF($8)                   \n\t"
3023     "gsldrc1    $f12, 0x0($8)                   \n\t"
3024     "gsldrc1    $f14, 0x8($8)                   \n\t"
3025 
3026     FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
3027                        $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14)
3028     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3029     "beqz       %[iHeight], 6f                  \n\t"
3030     PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
3031     "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
3032     "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
3033     "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
3034     "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
3035     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3036 
3037     FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
3038                        $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14)
3039     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3040     "beqz       %[iHeight], 6f                  \n\t"
3041     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3042     PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
3043     "gsldlc1    $f20, 0x7($8)                   \n\t"
3044     "gsldlc1    $f22, 0xF($8)                   \n\t"
3045     "gsldrc1    $f20, 0x0($8)                   \n\t"
3046     "gsldrc1    $f22, 0x8($8)                   \n\t"
3047     "j          5b                              \n\t"
3048 
3049     "6:                                         \n\t"
3050     PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
3051     "beqz       %[iWidth], 7f                   \n\t"
3052     "move       %[pTap], $10                    \n\t"
3053     "move       %[pDst], $11                    \n\t"
3054     "move       %[iHeight], $12                 \n\t"
3055     PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
3056     PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
3057     "j          4b                              \n\t"
3058 
3059     "7:                                         \n\t"
3060     : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
3061       [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
3062     : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
3063     : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
3064       "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
3065       "$f22", "$f24", "$f26", "$f28", "$f30"
3066   );
3067   RECOVER_REG;
3068 }
3069 
3070 //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3071 static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc,
3072                    int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
3073                    int32_t iWidth, int32_t iHeight) {
3074   ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
3075 
3076   if (iWidth == 17 || iWidth == 9){
3077     int32_t tmp1 = 2 * (iWidth - 8);
3078     McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
3079 
3080     McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
3081 
3082     McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8,
3083                                         iDstStride, 8, iHeight);
3084   } else {
3085     int16_t iTmp[17 + 5];
3086     int32_t i, j, k;
3087 
3088     for (i = 0; i < iHeight; i++) {
3089       for (j = 0; j < iWidth + 5; j++) {
3090         iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
3091       }
3092       for (k = 0; k < iWidth; k++) {
3093         pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
3094       }
3095       pSrc += iSrcStride;
3096       pDst += iDstStride;
3097     }
3098   }
3099 }
3100 
McCopyWidthEq4_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3101 void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride,
3102                         uint8_t *pDst, int iDstStride, int iHeight) {
3103   __asm__ volatile (
3104     ".set       arch=loongson3a                 \n\t"
3105     "1:                                         \n\t"
3106     "lwl        $8, 0x3(%[pSrc])                \n\t"
3107     "lwr        $8, 0x0(%[pSrc])                \n\t"
3108     "swl        $8, 0x3(%[pDst])                \n\t"
3109     "swr        $8, 0x0(%[pDst])                \n\t"
3110     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3111     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3112     PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3113     "bnez       %[iHeight], 1b                  \n\t"
3114     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3115     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3116     : "memory", "$8"
3117   );
3118 }
3119 
McCopyWidthEq8_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3120 void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride,
3121                         uint8_t *pDst, int iDstStride, int iHeight) {
3122   __asm__ volatile (
3123     ".set       arch=loongson3a                 \n\t"
3124     "1:                                         \n\t"
3125     "ldl        $8, 0x7(%[pSrc])                \n\t"
3126     "ldr        $8, 0x0(%[pSrc])                \n\t"
3127     "sdl        $8, 0x7(%[pDst])                \n\t"
3128     "sdr        $8, 0x0(%[pDst])                \n\t"
3129     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3130     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3131     PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3132     "bnez       %[iHeight], 1b                  \n\t"
3133     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3134     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3135     : "memory", "$8"
3136   );
3137 }
3138 
McCopyWidthEq16_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3139 void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride,
3140                          uint8_t *pDst, int iDstStride, int iHeight) {
3141   __asm__ volatile (
3142     ".set       arch=loongson3a                 \n\t"
3143     "1:                                         \n\t"
3144     "ldl        $8, 0x7(%[pSrc])                \n\t"
3145     "ldl        $9, 0xF(%[pSrc])                \n\t"
3146     "ldr        $8, 0x0(%[pSrc])                \n\t"
3147     "ldr        $9, 0x8(%[pSrc])                \n\t"
3148     "sdl        $8, 0x7(%[pDst])                \n\t"
3149     "sdl        $9, 0xF(%[pDst])                \n\t"
3150     "sdr        $8, 0x0(%[pDst])                \n\t"
3151     "sdr        $9, 0x8(%[pDst])                \n\t"
3152     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3153     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3154     PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3155     "bnez       %[iHeight], 1b                  \n\t"
3156     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3157     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3158     : "memory", "$8", "$9"
3159   );
3160 }
3161 
/*!
 * \brief   Copy a block by dispatching on its width.
 *
 * Widths 16, 8 and 4 use the matching assembly copy routine; every other
 * width is passed to McCopyWidthEq2_c().
 *
 * \param   pSrc        source pixels
 * \param   iSrcStride  source stride
 * \param   pDst        destination pixels
 * \param   iDstStride  destination stride
 * \param   iWidth      block width, selects the copy routine
 * \param   iHeight     block height
 */
static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
3173 
McChromaWidthEq4_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,const uint8_t * pABCD,int32_t iHeight)3174 void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
3175                           int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
3176   __asm__ volatile (
3177     ".set       arch=loongson3a                 \n\t"
3178     "gsldlc1    $f6, 0x7(%[pABCD])              \n\t"
3179     "gsldrc1    $f6, 0x0(%[pABCD])              \n\t"
3180     "xor        $f14, $f14, $f14                \n\t"
3181     "punpcklbh  $f6, $f6, $f6                   \n\t"
3182     "mov.d      $f8, $f6                        \n\t"
3183     "punpcklhw  $f6, $f6, $f6                   \n\t"
3184     "punpckhhw  $f8, $f8, $f8                   \n\t"
3185     "mov.d      $f10, $f6                       \n\t"
3186     "punpcklbh  $f6, $f6, $f14                  \n\t"
3187     "punpckhbh  $f10, $f10, $f14                \n\t"
3188 
3189     "mov.d      $f12, $f8                       \n\t"
3190     "punpcklbh  $f8, $f8, $f14                  \n\t"
3191     "punpckhbh  $f12, $f12, $f14                \n\t"
3192     PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
3193     "dli        $8, 0x6                         \n\t"
3194     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3195     "gsldlc1    $f2, 0x8(%[pSrc])               \n\t"
3196     "dmtc1      $8, $f16                        \n\t"
3197     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3198     "gsldrc1    $f2, 0x1(%[pSrc])               \n\t"
3199     "dli        $8, 0x0020002000200020          \n\t"
3200     "punpcklbh  $f0, $f0, $f14                  \n\t"
3201     "punpcklbh  $f2, $f2, $f14                  \n\t"
3202 
3203     "dmtc1      $8, $f18                        \n\t"
3204     "1:                                         \n\t"
3205     "pmullh     $f0, $f0, $f6                   \n\t"
3206     "pmullh     $f2, $f2, $f10                  \n\t"
3207     "paddh      $f0, $f0, $f2                   \n\t"
3208 
3209     "gsldlc1    $f2, 0x7(%[pABCD])              \n\t"
3210     "gsldrc1    $f2, 0x0(%[pABCD])              \n\t"
3211     "punpcklbh  $f2, $f2, $f14                  \n\t"
3212     "mov.d      $f4, $f2                        \n\t"
3213     "pmullh     $f2, $f2, $f8                   \n\t"
3214     "paddh      $f0, $f0, $f2                   \n\t"
3215     "gsldlc1    $f2, 0x8(%[pABCD])              \n\t"
3216     "gsldrc1    $f2, 0x1(%[pABCD])              \n\t"
3217     "punpcklbh  $f2, $f2, $f14                  \n\t"
3218     "mov.d      $f14, $f2                       \n\t"
3219     "pmullh     $f2, $f2, $f12                  \n\t"
3220     "paddh      $f0, $f0, $f2                   \n\t"
3221     "mov.d      $f2, $f14                       \n\t"
3222     "paddh      $f0, $f0, $f18                  \n\t"
3223     "psrlh      $f0, $f0, $f16                  \n\t"
3224     "xor        $f14, $f14, $f14                \n\t"
3225     "packushb   $f0, $f0, $f14                  \n\t"
3226     "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
3227     "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
3228     "mov.d      $f0, $f4                        \n\t"
3229     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3230     PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
3231     PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3232     "bnez       %[iHeight], 1b                  \n\t"
3233     : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
3234       [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight)
3235     : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
3236     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3237       "$f14", "$f16", "$f18"
3238   );
3239 }
3240 
McChromaWidthEq8_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,const uint8_t * pABCD,int32_t iHeight)3241 void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
3242                           int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
3243   BACKUP_REG;
3244   __asm__ volatile (
3245     ".set       arch=loongson3a                 \n\t"
3246     "gsldlc1    $f12, 0x7(%[pABCD])             \n\t"
3247     "xor        $f28, $f28, $f28                \n\t"
3248     "gsldrc1    $f12, 0x0(%[pABCD])             \n\t"
3249     "punpcklbh  $f12, $f12, $f12                \n\t"
3250     "punpckhhw  $f14, $f12, $f12                \n\t"
3251     "punpcklhw  $f12, $f12, $f12                \n\t"
3252 
3253     "mov.d      $f16, $f14                      \n\t"
3254     "punpckhwd  $f14, $f12, $f12                \n\t"
3255     "punpcklwd  $f12, $f12, $f12                \n\t"
3256     "punpckhwd  $f18, $f16, $f16                \n\t"
3257     "punpcklwd  $f16, $f16, $f16                \n\t"
3258     "mov.d      $f20, $f14                      \n\t"
3259     "mov.d      $f24, $f18                      \n\t"
3260 
3261     "punpckhbh  $f14, $f12, $f28                \n\t"
3262     "punpcklbh  $f12, $f12, $f28                \n\t"
3263     "punpckhbh  $f22, $f20, $f28                \n\t"
3264     "punpcklbh  $f20, $f20, $f28                \n\t"
3265     "punpckhbh  $f18, $f16, $f28                \n\t"
3266     "punpcklbh  $f16, $f16, $f28                \n\t"
3267     "punpckhbh  $f26, $f24, $f28                \n\t"
3268     "punpcklbh  $f24, $f24, $f28                \n\t"
3269 
3270     PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
3271     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3272     "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
3273     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3274     "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
3275     "punpckhbh  $f2, $f0, $f28                  \n\t"
3276     "punpcklbh  $f0, $f0, $f28                  \n\t"
3277     "punpckhbh  $f6, $f4, $f28                  \n\t"
3278     "punpcklbh  $f4, $f4, $f28                  \n\t"
3279     "1:                                         \n\t"
3280     "dli        $8, 0x20                        \n\t"
3281     "dmtc1      $8, $f30                        \n\t"
3282 
3283     "pmullh     $f0, $f0, $f12                  \n\t"
3284     "pmullh     $f2, $f2, $f14                  \n\t"
3285     "pmullh     $f4, $f4, $f20                  \n\t"
3286     "pmullh     $f6, $f6, $f22                  \n\t"
3287     "paddh      $f0, $f0, $f4                   \n\t"
3288     "paddh      $f2, $f2, $f6                   \n\t"
3289 
3290     "gsldlc1    $f4, 0x7(%[pABCD])              \n\t"
3291     "gsldrc1    $f4, 0x0(%[pABCD])              \n\t"
3292     "punpckhbh  $f6, $f4, $f28                  \n\t"
3293     "punpcklbh  $f4, $f4, $f28                  \n\t"
3294     "mov.d      $f8, $f4                        \n\t"
3295     "mov.d      $f10, $f6                       \n\t"
3296     "pmullh     $f4, $f4, $f16                  \n\t"
3297     "pmullh     $f6, $f6, $f18                  \n\t"
3298     "paddh      $f0, $f0, $f4                   \n\t"
3299     "paddh      $f2, $f2, $f6                   \n\t"
3300 
3301     "gsldlc1    $f4, 0x8(%[pABCD])              \n\t"
3302     "gsldrc1    $f4, 0x1(%[pABCD])              \n\t"
3303     "punpckhbh  $f6, $f4, $f28                  \n\t"
3304     "punpcklbh  $f4, $f4, $f28                  \n\t"
3305     "mov.d      $f28, $f4                       \n\t"
3306     "mov.d      $f30, $f6                       \n\t"
3307     "pmullh     $f4, $f4, $f24                  \n\t"
3308     "pmullh     $f6, $f6, $f26                  \n\t"
3309     "paddh      $f0, $f0, $f4                   \n\t"
3310     "paddh      $f2, $f2, $f6                   \n\t"
3311     "mov.d      $f4, $f28                       \n\t"
3312     "mov.d      $f6, $f30                       \n\t"
3313 
3314     "dli        $8, 0x0020002000200020          \n\t"
3315     "dmfc1      $9, $f20                        \n\t"
3316     "dmtc1      $8, $f20                        \n\t"
3317     "dli        $8, 0x6                         \n\t"
3318     "paddh      $f0, $f0, $f20                  \n\t"
3319     "paddh      $f2, $f2, $f20                  \n\t"
3320     "dmtc1      $8, $f20                        \n\t"
3321     "psrlh      $f0, $f0, $f20                  \n\t"
3322     "psrlh      $f2, $f2, $f20                  \n\t"
3323 
3324     "xor        $f28, $f28, $f28                \n\t"
3325     "packushb   $f0, $f0, $f2                   \n\t"
3326     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3327     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3328 
3329     "mov.d      $f0, $f8                        \n\t"
3330     "mov.d      $f2, $f10                       \n\t"
3331     "dmtc1      $9, $f20                        \n\t"
3332     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3333     PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
3334 
3335     PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
3336     "bnez       %[iHeight], 1b                  \n\t"
3337     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD),
3338       [iHeight]"+&r"(iHeight)
3339     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3340     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3341       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3342   );
3343   RECOVER_REG;
3344 }
3345 
McChroma_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)3346 void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3347                   int32_t iDstStride, int16_t iMvX, int16_t iMvY,
3348                   int32_t iWidth, int32_t iHeight) {
3349   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
3350     McChromaWidthEq4_mmi,
3351     McChromaWidthEq8_mmi
3352   };
3353   const int32_t kiD8x = iMvX & 0x07;
3354   const int32_t kiD8y = iMvY & 0x07;
3355   if (kiD8x == 0 && kiD8y == 0) {
3356     McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
3357     return;
3358   }
3359   if (iWidth != 2) {
3360     kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
3361                                       g_kuiABCD[kiD8y][kiD8x], iHeight);
3362   } else
3363     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
3364                           iWidth, iHeight);
3365 }
3366 
McHorVer20WidthEq8_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3367 void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3368                             int iDstStride, int iHeight) {
3369   BACKUP_REG;
3370   __asm__ volatile (
3371     ".set       arch=loongson3a                 \n\t"
3372     PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
3373     "xor        $f28, $f28, $f28                \n\t"
3374     "dli        $8, 0x0010001000100010          \n\t"
3375     "dmtc1      $8, $f24                        \n\t"
3376     "dli        $8, 0x2                         \n\t"
3377     "dmtc1      $8, $f26                        \n\t"
3378     "dli        $8, 0x5                         \n\t"
3379     "dmtc1      $8, $f30                        \n\t"
3380     "1:                                         \n\t"
3381     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3382     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3383     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3384     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3385     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3386     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3387     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3388     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3389     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3390     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3391     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3392     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
3393     "punpckhbh  $f2, $f0, $f28                  \n\t"
3394     "punpckhbh  $f6, $f4, $f28                  \n\t"
3395     "punpckhbh  $f10, $f8, $f28                 \n\t"
3396     "punpckhbh  $f14, $f12, $f28                \n\t"
3397     "punpckhbh  $f18, $f16, $f28                \n\t"
3398     "punpckhbh  $f22, $f20, $f28                \n\t"
3399     "punpcklbh  $f0, $f0, $f28                  \n\t"
3400     "punpcklbh  $f4, $f4, $f28                  \n\t"
3401     "punpcklbh  $f8, $f8, $f28                  \n\t"
3402     "punpcklbh  $f12, $f12, $f28                \n\t"
3403     "punpcklbh  $f16, $f16, $f28                \n\t"
3404     "punpcklbh  $f20, $f20, $f28                \n\t"
3405     "paddh      $f8, $f8, $f12                  \n\t"
3406     "paddh      $f10, $f10, $f14                \n\t"
3407     "paddh      $f16, $f16, $f20                \n\t"
3408     "paddh      $f18, $f18, $f22                \n\t"
3409     "psllh      $f16, $f16, $f26                \n\t"
3410     "psllh      $f18, $f18, $f26                \n\t"
3411     "psubh      $f16, $f16, $f8                 \n\t"
3412     "psubh      $f18, $f18, $f10                \n\t"
3413     "paddh      $f0, $f0, $f4                   \n\t"
3414     "paddh      $f2, $f2, $f6                   \n\t"
3415     "paddh      $f0, $f0, $f16                  \n\t"
3416     "paddh      $f2, $f2, $f18                  \n\t"
3417     "psllh      $f16, $f16, $f26                \n\t"
3418     "psllh      $f18, $f18, $f26                \n\t"
3419     "paddh      $f0, $f0, $f16                  \n\t"
3420     "paddh      $f2, $f2, $f18                  \n\t"
3421     "paddh      $f0, $f0, $f24                  \n\t"
3422     "paddh      $f2, $f2, $f24                  \n\t"
3423     "psrah      $f0, $f0, $f30                  \n\t"
3424     "psrah      $f2, $f2, $f30                  \n\t"
3425     "packushb   $f0, $f0, $f2                   \n\t"
3426     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3427     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3428     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3429     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3430     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3431     "bnez       %[iHeight], 1b                  \n\t"
3432     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3433     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3434     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3435       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3436   );
3437   RECOVER_REG;
3438 }
3439 
McHorVer20WidthEq16_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3440 void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3441                              int iDstStride, int iHeight) {
3442   BACKUP_REG;
3443   __asm__ volatile (
3444     ".set       arch=loongson3a                 \n\t"
3445     PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
3446     "dli        $8, 0x0010001000100010          \n\t"
3447     "dmtc1      $8, $f24                        \n\t"
3448     "dli        $8, 0x2                         \n\t"
3449     "dmtc1      $8, $f26                        \n\t"
3450     "dli        $8, 0x5                         \n\t"
3451     "dmtc1      $8, $f30                        \n\t"
3452     "1:                                         \n\t"
3453     "xor        $f28, $f28, $f28                \n\t"
3454     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3455     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3456     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3457     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3458     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3459     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3460     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3461     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3462     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3463     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3464     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3465     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
3466     "punpckhbh  $f2, $f0, $f28                  \n\t"
3467     "punpckhbh  $f6, $f4, $f28                  \n\t"
3468     "punpckhbh  $f10, $f8, $f28                 \n\t"
3469     "punpckhbh  $f14, $f12, $f28                \n\t"
3470     "punpckhbh  $f18, $f16, $f28                \n\t"
3471     "punpckhbh  $f22, $f20, $f28                \n\t"
3472     "punpcklbh  $f0, $f0, $f28                  \n\t"
3473     "punpcklbh  $f4, $f4, $f28                  \n\t"
3474     "punpcklbh  $f8, $f8, $f28                  \n\t"
3475     "punpcklbh  $f12, $f12, $f28                \n\t"
3476     "punpcklbh  $f16, $f16, $f28                \n\t"
3477     "punpcklbh  $f20, $f20, $f28                \n\t"
3478     "paddh      $f8, $f8, $f12                  \n\t"
3479     "paddh      $f10, $f10, $f14                \n\t"
3480     "paddh      $f16, $f16, $f20                \n\t"
3481     "paddh      $f18, $f18, $f22                \n\t"
3482     "psllh      $f16, $f16, $f26                \n\t"
3483     "psllh      $f18, $f18, $f26                \n\t"
3484     "psubh      $f16, $f16, $f8                 \n\t"
3485     "psubh      $f18, $f18, $f10                \n\t"
3486     "paddh      $f0, $f0, $f4                   \n\t"
3487     "paddh      $f2, $f2, $f6                   \n\t"
3488     "paddh      $f0, $f0, $f16                  \n\t"
3489     "paddh      $f2, $f2, $f18                  \n\t"
3490     "psllh      $f16, $f16, $f26                \n\t"
3491     "psllh      $f18, $f18, $f26                \n\t"
3492     "paddh      $f0, $f0, $f16                  \n\t"
3493     "paddh      $f2, $f2, $f18                  \n\t"
3494     "paddh      $f0, $f0, $f24                  \n\t"
3495     "paddh      $f2, $f2, $f24                  \n\t"
3496     "psrah      $f0, $f0, $f30                  \n\t"
3497     "psrah      $f2, $f2, $f30                  \n\t"
3498     "packushb   $f0, $f0, $f2                   \n\t"
3499     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3500     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3501     "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
3502     "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
3503     "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
3504     "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
3505     "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
3506     "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
3507     "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
3508     "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
3509     "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
3510     "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
3511     "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
3512     "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
3513     "punpckhbh  $f2, $f0, $f28                  \n\t"
3514     "punpckhbh  $f6, $f4, $f28                  \n\t"
3515     "punpckhbh  $f10, $f8, $f28                 \n\t"
3516     "punpckhbh  $f14, $f12, $f28                \n\t"
3517     "punpckhbh  $f18, $f16, $f28                \n\t"
3518     "punpckhbh  $f22, $f20, $f28                \n\t"
3519     "punpcklbh  $f0, $f0, $f28                  \n\t"
3520     "punpcklbh  $f4, $f4, $f28                  \n\t"
3521     "punpcklbh  $f8, $f8, $f28                  \n\t"
3522     "punpcklbh  $f12, $f12, $f28                \n\t"
3523     "punpcklbh  $f16, $f16, $f28                \n\t"
3524     "punpcklbh  $f20, $f20, $f28                \n\t"
3525     "paddh      $f8, $f8, $f12                  \n\t"
3526     "paddh      $f10, $f10, $f14                \n\t"
3527     "paddh      $f16, $f16, $f20                \n\t"
3528     "paddh      $f18, $f18, $f22                \n\t"
3529     "psllh      $f16, $f16, $f26                \n\t"
3530     "psllh      $f18, $f18, $f26                \n\t"
3531     "psubh      $f16, $f16, $f8                 \n\t"
3532     "psubh      $f18, $f18, $f10                \n\t"
3533     "paddh      $f0, $f0, $f4                   \n\t"
3534     "paddh      $f2, $f2, $f6                   \n\t"
3535     "paddh      $f0, $f0, $f16                  \n\t"
3536     "paddh      $f2, $f2, $f18                  \n\t"
3537     "psllh      $f16, $f16, $f26                \n\t"
3538     "psllh      $f18, $f18, $f26                \n\t"
3539     "paddh      $f0, $f0, $f16                  \n\t"
3540     "paddh      $f2, $f2, $f18                  \n\t"
3541     "paddh      $f0, $f0, $f24                  \n\t"
3542     "paddh      $f2, $f2, $f24                  \n\t"
3543     "psrah      $f0, $f0, $f30                  \n\t"
3544     "psrah      $f2, $f2, $f30                  \n\t"
3545     "packushb   $f0, $f0, $f2                   \n\t"
3546     "gssdlc1    $f0, 0xF(%[pDst])               \n\t"
3547     "gssdrc1    $f0, 0x8(%[pDst])               \n\t"
3548     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3549     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3550     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3551     "bnez       %[iHeight], 1b                  \n\t"
3552     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3553     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3554     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3555       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3556   );
3557   RECOVER_REG;
3558 }
3559 
McHorVer20WidthEq4_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3560 void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3561                             int iDstStride, int iHeight) {
3562   __asm__ volatile (
3563     ".set       arch=loongson3a                 \n\t"
3564     "1:                                         \n\t"
3565     PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
3566     "xor        $f14, $f14, $f14                \n\t"
3567     "dli        $8, 0x0010001000100010          \n\t"
3568     "dmtc1      $8, $f12                        \n\t"
3569     "1:                                         \n\t"
3570     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3571     "gsldlc1    $f2, 0xc(%[pSrc])               \n\t"
3572     "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
3573     "gsldlc1    $f6, 0xb(%[pSrc])               \n\t"
3574     "gsldlc1    $f8, 0x9(%[pSrc])               \n\t"
3575     "gsldlc1    $f10, 0xa(%[pSrc])              \n\t"
3576     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3577     "gsldrc1    $f2, 0x5(%[pSrc])               \n\t"
3578     "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
3579     "gsldrc1    $f6, 0x4(%[pSrc])               \n\t"
3580     "gsldrc1    $f8, 0x2(%[pSrc])               \n\t"
3581     "gsldrc1    $f10, 0x3(%[pSrc])              \n\t"
3582     "dli        $8, 0x2                         \n\t"
3583     "punpcklbh  $f0, $f0, $f14                  \n\t"
3584     "punpcklbh  $f2, $f2, $f14                  \n\t"
3585     "punpcklbh  $f4, $f4, $f14                  \n\t"
3586     "punpcklbh  $f6, $f6, $f14                  \n\t"
3587     "punpcklbh  $f8, $f8, $f14                  \n\t"
3588     "punpcklbh  $f10, $f10, $f14                \n\t"
3589     "dmtc1      $8, $f16                        \n\t"
3590     "paddh      $f4, $f4, $f6                   \n\t"
3591     "paddh      $f8, $f8, $f10                  \n\t"
3592     "psllh      $f8, $f8, $f16                  \n\t"
3593     "psubh      $f8, $f8, $f4                   \n\t"
3594     "paddh      $f0, $f0, $f2                   \n\t"
3595     "paddh      $f0, $f0, $f8                   \n\t"
3596     "dli        $8, 0x5                         \n\t"
3597     "psllh      $f8, $f8, $f16                  \n\t"
3598     "paddh      $f0, $f0, $f8                   \n\t"
3599     "paddh      $f0, $f0, $f12                  \n\t"
3600     "dmtc1      $8, $f16                        \n\t"
3601     "psrah      $f0, $f0, $f16                  \n\t"
3602     "packushb   $f0, $f0, $f14                  \n\t"
3603     "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
3604     "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
3605     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3606     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3607     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3608     "bnez       %[iHeight], 1b                  \n\t"
3609     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3610     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3611     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3612       "$f14", "$f16"
3613   );
3614 }
3615 
/*!
 * \brief   Horizontal half-pel luma filter, dispatched on block width.
 *
 * Widths 16 and 8 use their dedicated kernels; every other width is routed
 * to the 4-wide kernel.
 *
 * \param   pSrc        source pixels
 * \param   iSrcStride  source stride
 * \param   pDst        destination pixels
 * \param   iDstStride  destination stride
 * \param   iWidth      block width, selects the kernel
 * \param   iHeight     block height
 */
static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
3625 
McHorVer02WidthEq8_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3626 void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3627                             int iDstStride, int iHeight) {
3628   BACKUP_REG;
3629   __asm__ volatile (
3630     ".set       arch=loongson3a                 \n\t"
3631     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3632     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3633     "xor        $f28, $f28, $f28                \n\t"
3634     MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
3635     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3636     MMI_LOAD_8P($f4, $f6, $f28, $8)
3637     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3638     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3639     MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
3640     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3641     MMI_LOAD_8P($f12, $f14, $f28, $8)
3642     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3643     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3644     MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
3645     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3646     MMI_LOAD_8P($f20, $f22, $f28, $8)
3647 
3648     "1:                                         \n\t"
3649     FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
3650                  $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
3651     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3652     "beqz       %[iHeight], 2f                  \n\t"
3653     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3654     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3655     MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
3656     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3657     FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
3658                  $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
3659     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3660     "beqz       %[iHeight], 2f                  \n\t"
3661     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3662     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3663     MMI_LOAD_8P($f28, $f30, $f0, $8)
3664     FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
3665                  $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
3666     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3667     "beqz       %[iHeight], 2f                  \n\t"
3668     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3669     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3670     MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
3671     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3672     FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
3673                  $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
3674     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3675     "beqz       %[iHeight], 2f                  \n\t"
3676     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3677     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3678     MMI_LOAD_8P($f4, $f6, $f8, $8)
3679     FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
3680                  $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
3681     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3682     "beqz       %[iHeight], 2f                  \n\t"
3683     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3684     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3685     MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
3686     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3687     FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
3688                  $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
3689     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3690     "beqz       %[iHeight], 2f                  \n\t"
3691     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3692     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3693     MMI_LOAD_8P($f12, $f14, $f16, $8)
3694     FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
3695                  $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
3696     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3697     "beqz       %[iHeight], 2f                  \n\t"
3698     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3699     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3700     MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
3701     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3702     FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
3703                  $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
3704     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3705     "beqz       %[iHeight], 2f                  \n\t"
3706     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3707     PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
3708     MMI_LOAD_8P($f20, $f22, $f24, $8)
3709     "j          1b                              \n\t"
3710     "2:                                         \n\t"
3711     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3712     : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3713     : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3714       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3715   );
3716   RECOVER_REG;
3717 }
3718 
/*!
 * \brief  16-pixel-wide variant of McHorVer02, built from the 8-pixel-wide routine.
 *
 * The block is processed as two adjacent 8-byte-wide columns: first the one
 * starting at pSrc / pDst, then the one starting 8 bytes further along.
 * Strides and height are forwarded unchanged.
 */
static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
  int32_t iColumn;
  for (iColumn = 0; iColumn < 16; iColumn += 8) {
    McHorVer02WidthEq8_mmi (pSrc + iColumn, iSrcStride, pDst + iColumn, iDstStride, iHeight);
  }
}
3724 
/*!
 * \brief  Width dispatcher for the McHorVer02 interpolation (slot [0][2] of the
 *         McLuma_mmi table).
 *
 * Widths 16 and 8 use the MMI routines. Every other width falls back to the
 * C implementation, which is always invoked with a width of 4.
 */
static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride,
                   uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
                   int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
    break;
  }
}
3735 
/*!
 * \brief  Horizontal 6-tap pass over an 8-sample-wide column (Loongson MMI asm).
 *
 * The source pointer is first moved up by two rows. Then, for each of iHeight
 * rows, the kernel (1, -5, 20, 20, -5, 1) is applied to the six 8-byte vectors
 * starting at pSrc[0] .. pSrc[5] (bytes 0..12 of the row are read), and the
 * eight resulting 16-bit sums are stored, without rounding or shifting, to
 * pDst[0..15].
 *
 * \param pSrc        source pointer; reading starts at pSrc - 2 * iSrcStride
 * \param iSrcStride  byte distance between source rows
 * \param pDst        destination for the 16-bit intermediate sums (16 bytes per row)
 * \param iDstStride  byte distance between destination rows
 * \param iHeight     number of rows to produce; the counter is tested after the
 *                    loop body, so it must be at least 1
 *
 * NOTE(review): iSrcStride is declared int16_t, while the caller in this file
 * holds the stride as int32_t -- confirm that strides always fit.
 * BACKUP_REG / RECOVER_REG are defined elsewhere; presumably they save and
 * restore registers around the asm statement -- confirm against their definition.
 */
McHorVer22Width8HorFirst_mmi(const uint8_t * pSrc,int16_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3736 void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int16_t iSrcStride,
3737      uint8_t *pDst, int32_t iDstStride, int32_t iHeight) {
3738   BACKUP_REG;
3739   __asm__ volatile (
3740     ".set       arch=loongson3a                 \n\t"
    // $f28 = 0: the zero operand used when widening bytes to halfwords.
3741     "xor        $f28, $f28, $f28                \n\t"
    // Start two rows above the row pSrc points at.
3742     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3743     PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    // $f30 = 2: shift count for the psllh instructions (multiply by 4).
3744     "dli        $8, 0x2                         \n\t"
3745     "dmtc1      $8, $f30                        \n\t"
3746     "1:                                         \n\t"
    // $f28 is cleared again on every iteration although nothing in the loop
    // writes it.
3747     "xor        $f28, $f28, $f28                \n\t"
    // Unaligned 8-byte loads (left/right pairs):
    //   $f0 = src[0..7]   $f4  = src[5..12]
    //   $f8 = src[1..8]   $f12 = src[4..11]
    //   $f16 = src[2..9]  $f20 = src[3..10]
3748     "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
3749     "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
3750     "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
3751     "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
3752     "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
3753     "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
3754     "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
3755     "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
3756     "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
3757     "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
3758     "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
3759     "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
    // Widen each vector to 16-bit lanes: the high four bytes go to the odd
    // partner register ($f2, $f6, ...), the low four stay in place.
3760     "punpckhbh  $f2, $f0, $f28                  \n\t"
3761     "punpckhbh  $f6, $f4, $f28                  \n\t"
3762     "punpckhbh  $f10, $f8, $f28                 \n\t"
3763     "punpckhbh  $f14, $f12, $f28                \n\t"
3764     "punpckhbh  $f18, $f16, $f28                \n\t"
3765     "punpckhbh  $f22, $f20, $f28                \n\t"
3766     "punpcklbh  $f0, $f0, $f28                  \n\t"
3767     "punpcklbh  $f4, $f4, $f28                  \n\t"
3768     "punpcklbh  $f8, $f8, $f28                  \n\t"
3769     "punpcklbh  $f12, $f12, $f28                \n\t"
3770     "punpcklbh  $f16, $f16, $f28                \n\t"
3771     "punpcklbh  $f20, $f20, $f28                \n\t"
    // $f8/$f10   = s1 + s4
    // $f16/$f18  = s2 + s3
3772     "paddh      $f8, $f8, $f12                  \n\t"
3773     "paddh      $f10, $f10, $f14                \n\t"
3774     "paddh      $f16, $f16, $f20                \n\t"
3775     "paddh      $f18, $f18, $f22                \n\t"
    // $f16/$f18  = 4 * (s2 + s3) - (s1 + s4)
3776     "psllh      $f16, $f16, $f30                \n\t"
3777     "psllh      $f18, $f18, $f30                \n\t"
3778     "psubh      $f16, $f16, $f8                 \n\t"
3779     "psubh      $f18, $f18, $f10                \n\t"
    // $f0/$f2    = (s0 + s5) + [4 * (s2 + s3) - (s1 + s4)]
3780     "paddh      $f0, $f0, $f4                   \n\t"
3781     "paddh      $f2, $f2, $f6                   \n\t"
3782     "paddh      $f0, $f0, $f16                  \n\t"
3783     "paddh      $f2, $f2, $f18                  \n\t"
    // Add four times the bracketed term again, giving
    // s0 + s5 - 5 * (s1 + s4) + 20 * (s2 + s3).
3784     "psllh      $f16, $f16, $f30                \n\t"
3785     "psllh      $f18, $f18, $f30                \n\t"
3786     "paddh      $f0, $f0, $f16                  \n\t"
3787     "paddh      $f2, $f2, $f18                  \n\t"
    // Unaligned stores: low four sums to pDst[0..7], high four to pDst[8..15].
3788     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3789     "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3790     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3791     "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
    // Next row; loop until the row counter reaches zero.
3792     PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3793     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
3794     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
3795     "bnez       %[iHeight], 1b                  \n\t"
3796     : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3797     : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
3798     : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3799       "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3800   );
3801   RECOVER_REG;
3802 }
3803 
McHorVer22WidthEq8_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3804 static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3805                    uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3806   ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
3807   McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
3808   McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
3809 }
3810 
/*!
 * \brief  16-pixel-wide variant of McHorVer22, built from the 8-pixel-wide routine.
 *
 * The left 8-byte-wide column is processed first, then the column starting
 * 8 bytes further along in both source and destination.
 */
static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
  int32_t iColumn;
  for (iColumn = 0; iColumn < 16; iColumn += 8) {
    McHorVer22WidthEq8_mmi (pSrc + iColumn, iSrcStride, pDst + iColumn, iDstStride, iHeight);
  }
}
3816 
/*!
 * \brief  Width dispatcher for the McHorVer22 interpolation (slot [2][2] of the
 *         McLuma_mmi table).
 *
 * Widths 16 and 8 use the MMI routines. Every other width falls back to the
 * C implementation, which is always invoked with a width of 4.
 */
static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
  switch (iWidth) {
  case 16:
    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
    break;
  }
}
3826 
/*!
 * \brief  Byte-wise average of two 4-pixel-wide blocks (Loongson MMI asm).
 *
 * One row is handled per loop iteration: the rows at pSrcA and pSrcB are
 * averaged with pavgb and the low 4 bytes of the result are stored at pDst.
 * All three pointers then advance by their own stride.
 *
 * \param pDst         destination block
 * \param iDstStride   byte distance between destination rows
 * \param pSrcA        first source block
 * \param iSrcAStride  byte distance between rows of pSrcA
 * \param pSrcB        second source block
 * \param iSrcBStride  byte distance between rows of pSrcB
 * \param iHeight      number of rows; tested after the loop body, so it must
 *                     be at least 1
 *
 * NOTE(review): each source row is fetched with an 8-byte load pair although
 * only 4 bytes are stored -- confirm the 4 extra bytes are always readable.
 * The clobber list names $8, $9 and $10, which this asm body does not reference.
 */
PixelAvgWidthEq4_mmi(uint8_t * pDst,int iDstStride,const uint8_t * pSrcA,int iSrcAStride,const uint8_t * pSrcB,int iSrcBStride,int iHeight)3827 void PixelAvgWidthEq4_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
3828      int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3829   __asm__ volatile (
3830     ".set       arch=loongson3a                    \n\t"
3831     "1:                                            \n\t"
    // Unaligned 8-byte loads of the current row from each source.
3832     "gsldlc1    $f0, 0x7(%[pSrcB])                 \n\t"
3833     "gsldlc1    $f2, 0x7(%[pSrcA])                 \n\t"
3834     "gsldrc1    $f0, 0x0(%[pSrcB])                 \n\t"
3835     "gsldrc1    $f2, 0x0(%[pSrcA])                 \n\t"
3836     "pavgb      $f0, $f0, $f2                      \n\t"
    // Unaligned 4-byte store of the averaged row.
3837     "gsswlc1    $f0, 0x3(%[pDst])                  \n\t"
3838     "gsswrc1    $f0, 0x0(%[pDst])                  \n\t"
    // Count the row and step every pointer to the next one.
3839     PTR_ADDIU  "%[iHeight], %[iHeight], -0x1       \n\t"
3840     PTR_ADDU   "%[pDst], %[pDst], %[iDstStride]    \n\t"
3841     PTR_ADDU   "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t"
3842     PTR_ADDU   "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t"
3843     "bnez       %[iHeight], 1b                     \n\t"
3844     : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3845       [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3846     : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3847       [iSrcBStride]"r"((int)iSrcBStride)
3848     : "memory", "$8", "$9", "$10", "$f0", "$f2"
3849   );
3850 }
3851 
/*!
 * \brief  Byte-wise average of two 8-pixel-wide blocks (Loongson MMI asm).
 *
 * Two rows are handled per loop iteration. The first row is read through
 * pSrcA / pSrcB and written through pDst; the second uses the scratch
 * registers $8 / $9 / $10, which hold those pointers plus one stride.
 * Afterwards every pointer has advanced by two strides.
 *
 * \param pDst         destination block
 * \param iDstStride   byte distance between destination rows
 * \param pSrcA        first source block
 * \param iSrcAStride  byte distance between rows of pSrcA
 * \param pSrcB        second source block
 * \param iSrcBStride  byte distance between rows of pSrcB
 * \param iHeight      number of rows; decremented by 2 per iteration and
 *                     compared against zero after the body, so it must be a
 *                     positive multiple of 2
 */
PixelAvgWidthEq8_mmi(uint8_t * pDst,int iDstStride,const uint8_t * pSrcA,int iSrcAStride,const uint8_t * pSrcB,int iSrcBStride,int iHeight)3852 void PixelAvgWidthEq8_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
3853      int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3854   __asm__ volatile (
3855     ".set       arch=loongson3a                 \n\t"
3856     "1:                                         \n\t"
    // Row 0: unaligned 8-byte loads from both sources, then average.
3857     "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3858     "gsldlc1    $f2, 0x7(%[pSrcB])              \n\t"
3859     "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3860     "gsldrc1    $f2, 0x0(%[pSrcB])              \n\t"
3861     "pavgb      $f0, $f0, $f2                   \n\t"
    // $8 / $9 = row 1 of pSrcA / pSrcB, computed between the two halves of
    // the row 0 store.
3862     PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3863     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3864     PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3865     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
    // Row 1: load, average, and store through $10 = pDst + iDstStride.
3866     "gsldlc1    $f0, 0x7($8)                    \n\t"
3867     "gsldlc1    $f2, 0x7($9)                    \n\t"
3868     "gsldrc1    $f0, 0x0($8)                    \n\t"
3869     "gsldrc1    $f2, 0x0($9)                    \n\t"
3870     "pavgb      $f0, $f0, $f2                   \n\t"
3871     PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3872     "gssdlc1    $f0, 0x7($10)                   \n\t"
    // Advance all three pointers to row 2 and count two rows done.
3873     PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3874     "gssdrc1    $f0, 0x0($10)                   \n\t"
3875     PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3876     PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
3877     PTR_ADDIU  "%[iHeight], %[iHeight], -0x2    \n\t"
3878     "bnez       %[iHeight], 1b                  \n\t"
3879     : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3880       [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3881     : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3882       [iSrcBStride]"r"((int)iSrcBStride)
3883     : "memory", "$8", "$9", "$10", "$f0", "$f2"
3884   );
3885 }
3886 
/*!
 * \brief  Byte-wise average of two 16-pixel-wide blocks (Loongson MMI asm).
 *
 * Four rows are handled per loop iteration, each as two 8-byte halves
 * ($f0 / $f4 for bytes 0..7, $f2 / $f6 for bytes 8..15). Rows 0 and 2 are
 * addressed through pSrcA / pSrcB / pDst; rows 1 and 3 go through the scratch
 * registers $8 / $9 / $10, which hold those pointers plus one stride.
 *
 * \param pDst         destination block
 * \param iDstStride   byte distance between destination rows
 * \param pSrcA        first source block
 * \param iSrcAStride  byte distance between rows of pSrcA
 * \param pSrcB        second source block
 * \param iSrcBStride  byte distance between rows of pSrcB
 * \param iHeight      number of rows; decremented by 4 per iteration and
 *                     compared against zero after the body, so it must be a
 *                     positive multiple of 4
 */
PixelAvgWidthEq16_mmi(uint8_t * pDst,int iDstStride,const uint8_t * pSrcA,int iSrcAStride,const uint8_t * pSrcB,int iSrcBStride,int iHeight)3887 void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
3888      int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
3889   __asm__ volatile (
3890     ".set       arch=loongson3a                 \n\t"
3891     "1:                                         \n\t"
    // Row 0: load 16 bytes from each source, average, store.
3892     "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3893     "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
3894     "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
3895     "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
3896     "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3897     "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
3898     "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
3899     "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
3900     "pavgb      $f0, $f0, $f4                   \n\t"
3901     "pavgb      $f2, $f2, $f6                   \n\t"
3902     PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3903     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3904     "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3905     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3906     "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
    // Row 1: sources via $8 / $9, destination via $10.
3907     PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3908     "gsldlc1    $f0, 0x7($8)                    \n\t"
3909     "gsldlc1    $f2, 0xF($8)                    \n\t"
3910     "gsldrc1    $f0, 0x0($8)                    \n\t"
3911     "gsldrc1    $f2, 0x8($8)                    \n\t"
3912     PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3913     "gsldlc1    $f4, 0x7($9)                    \n\t"
3914     "gsldlc1    $f6, 0xF($9)                    \n\t"
3915     "gsldrc1    $f4, 0x0($9)                    \n\t"
3916     "gsldrc1    $f6, 0x8($9)                    \n\t"
3917     "pavgb      $f0, $f0, $f4                   \n\t"
3918     "pavgb      $f2, $f2, $f6                   \n\t"
3919     "gssdlc1    $f0, 0x7($10)                   \n\t"
3920     "gssdlc1    $f2, 0xF($10)                   \n\t"
3921     "gssdrc1    $f0, 0x0($10)                   \n\t"
3922     "gssdrc1    $f2, 0x8($10)                   \n\t"
3923 
    // Row 2: step the named pointers two rows past their loop-entry values.
3924     PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3925     PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3926     PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
3927     "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
3928     "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
3929     "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
3930     "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
3931     "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
3932     "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
3933     "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
3934     "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
3935     "pavgb      $f0, $f0, $f4                   \n\t"
3936     "pavgb      $f2, $f2, $f6                   \n\t"
3937     PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
3938     PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
3939     "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
3940     "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
3941     "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
3942     "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
    // Row 3: again through $8 / $9 / $10.
3943     "gsldlc1    $f0, 0x7($8)                    \n\t"
3944     "gsldlc1    $f2, 0xF($8)                    \n\t"
3945     "gsldlc1    $f4, 0x7($9)                    \n\t"
3946     "gsldlc1    $f6, 0xF($9)                    \n\t"
3947     "gsldrc1    $f0, 0x0($8)                    \n\t"
3948     "gsldrc1    $f2, 0x8($8)                    \n\t"
3949     "gsldrc1    $f4, 0x0($9)                    \n\t"
3950     "gsldrc1    $f6, 0x8($9)                    \n\t"
3951     PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
3952     "pavgb      $f0, $f0, $f4                   \n\t"
3953     "pavgb      $f2, $f2, $f6                   \n\t"
3954     "gssdlc1    $f0, 0x7($10)                   \n\t"
3955     "gssdlc1    $f2, 0xF($10)                   \n\t"
3956     "gssdrc1    $f0, 0x0($10)                   \n\t"
3957     "gssdrc1    $f2, 0x8($10)                   \n\t"
    // Advance to the next group of four rows and count them.
3958     PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
3959     PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
3960     PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
3961     PTR_ADDIU  "%[iHeight], %[iHeight], -0x4    \n\t"
3962     "bnez       %[iHeight], 1b                  \n\t"
3963     : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
3964       [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
3965     : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
3966       [iSrcBStride]"r"((int)iSrcBStride)
3967     : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
3968   );
3969 }
3970 
McHorVer01_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3971 static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3972                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3973   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3974   if (iWidth == 16) {
3975     McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3976     PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3977   } else if (iWidth == 8) {
3978     McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3979     PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3980   } else {
3981     McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3982     PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3983   }
3984 }
3985 
McHorVer03_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3986 static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3987                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3988   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3989   if (iWidth == 16) {
3990     McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3991     PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3992   } else if (iWidth == 8) {
3993     McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3994     PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3995   } else {
3996     McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3997     PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3998   }
3999 }
4000 
McHorVer10_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4001 static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4002                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4003   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4004   if (iWidth == 16) {
4005     McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4006     PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4007   } else if (iWidth == 8) {
4008     McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4009     PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4010   } else {
4011     McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4012     PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4013   }
4014 }
4015 
McHorVer11_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4016 static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4017                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4018   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4019   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4020   if (iWidth == 16) {
4021     McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4022     McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4023     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4024   } else if (iWidth == 8) {
4025     McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4026     McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4027     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4028   } else {
4029     McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4030     McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4031     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4032   }
4033 }
4034 
McHorVer12_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4035 static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4036                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4037   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4038   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4039   if (iWidth == 16) {
4040     McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4041     McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4042     PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4043   } else if (iWidth == 8) {
4044     McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4045     McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4046     PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4047   } else {
4048     McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4049     McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4050     PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4051   }
4052 }
McHorVer13_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4053 static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4054                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4055   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4056   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4057   if (iWidth == 16) {
4058     McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4059     McHorVer02WidthEq16_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
4060     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4061   } else if (iWidth == 8) {
4062     McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4063     McHorVer02WidthEq8_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
4064     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4065   } else {
4066     McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4067     McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
4068     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4069   }
4070 }
McHorVer21_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4071 static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4072                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4073   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4074   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4075   if (iWidth == 16) {
4076     McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4077     McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4078     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4079   } else if (iWidth == 8) {
4080     McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4081     McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4082     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4083   } else {
4084     McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4085     McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4086     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4087   }
4088 }
4089 
McHorVer23_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4090 static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4091                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4092   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4093   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4094   if (iWidth == 16) {
4095     McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4096     McHorVer22WidthEq16_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
4097     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4098   } else if (iWidth == 8) {
4099     McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4100     McHorVer22WidthEq8_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
4101     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4102   } else {
4103     McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4104     McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
4105     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4106   }
4107 }
McHorVer30_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4108 static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4109                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4110   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4111   if (iWidth == 16) {
4112     McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4113     PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4114   } else if (iWidth == 8) {
4115     McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4116     PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4117   } else {
4118     McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4119     PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4120   }
4121 }
McHorVer31_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4122 static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4123                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4124   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4125   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4126   if (iWidth == 16) {
4127     McHorVer20WidthEq16_mmi (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
4128     McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4129     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4130   } else if (iWidth == 8) {
4131     McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4132     McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4133     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4134   } else {
4135     McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4136     McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4137     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4138   }
4139 }
McHorVer32_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4140 static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4141                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4142   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4143   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4144   if (iWidth == 16) {
4145     McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4146     McHorVer22WidthEq16_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
4147     PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4148   } else if (iWidth == 8) {
4149     McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4150     McHorVer22WidthEq8_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
4151     PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4152   } else {
4153     McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4154     McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
4155     PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4156   }
4157 }
McHorVer33_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4158 static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4159                                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4160   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4161   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4162   if (iWidth == 16) {
4163     McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4164     McHorVer02WidthEq16_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
4165     PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4166   } else if (iWidth == 8) {
4167     McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4168     McHorVer02WidthEq8_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
4169     PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4170   } else {
4171     McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4172     McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
4173     PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4174   }
4175 }
4176 
McLuma_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)4177 void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
4178                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
4179   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
4180     {McCopy_mmi,     McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi},
4181     {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi},
4182     {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi},
4183     {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi},
4184   };
4185 
4186   pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4187 }
4188 
PixelAvg_mmi(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)4189 void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
4190                   const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
4191   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
4192     PixelAvgWidthEq8_mmi,
4193     PixelAvgWidthEq16_mmi
4194   };
4195   kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
4196 }
4197 #endif//HAVE_MMI
4198 
4199 #if defined(HAVE_LSX)
McCopy_lsx(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4200 static inline void McCopy_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4201                               int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4202   if (iWidth == 16)
4203     McCopyWidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4204   else if (iWidth == 8)
4205     McCopyWidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4206   else if (iWidth == 4)
4207     McCopyWidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4208   else
4209     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4210 }
4211 
McChroma_lsx(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)4212 void McChroma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4213                   int32_t iDstStride, int16_t iMvX, int16_t iMvY,
4214                   int32_t iWidth, int32_t iHeight) {
4215   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
4216     McChromaWidthEq4_lsx,
4217     McChromaWidthEq8_lsx
4218   };
4219   const int32_t kiD8x = iMvX & 0x07;
4220   const int32_t kiD8y = iMvY & 0x07;
4221   if (kiD8x == 0 && kiD8y == 0) {
4222     McCopy_lsx (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4223     return;
4224   }
4225   if (iWidth != 2) {
4226     kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
4227                                        g_kuiABCD[kiD8y][kiD8x], iHeight);
4228   } else
4229     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
4230                           iWidth, iHeight);
4231 }
4232 #endif//HAVE_LSX
4233 
4234 } // anon ns.
4235 
InitMcFunc(SMcFunc * pMcFuncs,uint32_t uiCpuFlag)4236 void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
4237   pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;
4238   pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;
4239   pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;
4240   pMcFuncs->pfSampleAveraging = PixelAvg_c;
4241   pMcFuncs->pMcChromaFunc     = McChroma_c;
4242   pMcFuncs->pMcLumaFunc       = McLuma_c;
4243 
4244 #if defined (X86_ASM)
4245   if (uiCpuFlag & WELS_CPU_SSE2) {
4246     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_sse2;
4247     pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_sse2;
4248     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_sse2;
4249     pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
4250     pMcFuncs->pMcChromaFunc     = McChroma_sse2;
4251     pMcFuncs->pMcLumaFunc       = McLuma_sse2;
4252   }
4253 
4254   if (uiCpuFlag & WELS_CPU_SSSE3) {
4255     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_ssse3;
4256     pMcFuncs->pfLumaHalfpelVer  = McHorVer02_ssse3;
4257     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_ssse3;
4258     pMcFuncs->pMcChromaFunc = McChroma_ssse3;
4259     pMcFuncs->pMcLumaFunc   = McLuma_ssse3;
4260   }
4261 #ifdef HAVE_AVX2
4262   if (uiCpuFlag & WELS_CPU_AVX2) {
4263     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_avx2;
4264     pMcFuncs->pfLumaHalfpelVer  = McHorVer02_avx2;
4265     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_avx2;
4266     pMcFuncs->pMcLumaFunc       = McLuma_avx2;
4267   }
4268 #endif
4269 #endif //(X86_ASM)
4270 
4271 #if defined(HAVE_NEON)
4272   if (uiCpuFlag & WELS_CPU_NEON) {
4273     pMcFuncs->pMcLumaFunc       = McLuma_neon;
4274     pMcFuncs->pMcChromaFunc     = McChroma_neon;
4275     pMcFuncs->pfSampleAveraging = PixelAvg_neon;
4276     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_neon;//iWidth+1:4/8/16
4277     pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_neon;//heigh+1:4/8/16
4278     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_neon;//iWidth+1/heigh+1
4279   }
4280 #endif
4281 #if defined(HAVE_NEON_AARCH64)
4282   if (uiCpuFlag & WELS_CPU_NEON) {
4283     pMcFuncs->pMcLumaFunc       = McLuma_AArch64_neon;
4284     pMcFuncs->pMcChromaFunc     = McChroma_AArch64_neon;
4285     pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon;
4286     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_AArch64_neon;//iWidth+1:4/8/16
4287     pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_AArch64_neon;//heigh+1:4/8/16
4288     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1
4289   }
4290 #endif
4291 
4292 #if defined(HAVE_MMI)
4293   if (uiCpuFlag & WELS_CPU_MMI) {
4294     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_mmi;
4295     pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_mmi;
4296     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_mmi;
4297     pMcFuncs->pfSampleAveraging = PixelAvg_mmi;
4298     pMcFuncs->pMcChromaFunc     = McChroma_mmi;
4299     pMcFuncs->pMcLumaFunc       = McLuma_mmi;
4300   }
4301 #endif//HAVE_MMI
4302 
4303 #if defined(HAVE_LSX)
4304   if (uiCpuFlag & WELS_CPU_LSX) {
4305     pMcFuncs->pMcChromaFunc     = McChroma_lsx;
4306   }
4307 #endif//HAVE_LSX
4308 }
4309