1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file mc.c
33 *
34 * \brief Interfaces implementation for motion compensation
35 *
36 * \date 03/17/2009 Created
37 *
38 *************************************************************************************
39 */
40
41 #include "mc.h"
42
43 #include "cpu_core.h"
44 #include "ls_defines.h"
45 #include "macros.h"
46 #include "asmdefs_mmi.h"
47
48 namespace {
49
// Function-pointer types used to reference motion-compensation kernels.

// Kernel taking source/destination with strides, a pointer to weights (kpABCD) and a row count.
typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                       uint8_t* pDst, int32_t iDstStride,
                                       const uint8_t* kpABCD, int32_t iHeight);

// Kernel taking one writable and two read-only buffers, each followed by an int32_t, plus a final int32_t.
// NOTE(review): the parameter names are an assumption, borrowed from the argument order of
// PixelAvg_c without iWidth -- confirm against the functions actually assigned to this type.
typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t* pDst, int32_t iDstStride,
                                               const uint8_t* pSrcA, int32_t iSrcAStride,
                                               const uint8_t* pSrcB, int32_t iSrcBStride,
                                               int32_t iHeight);

// Kernel taking source/destination with strides plus explicit block width and height.
typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                        uint8_t* pDst, int32_t iDstStride,
                                        int32_t iWidth, int32_t iHeight);
56
/*
 * Weights for chroma fractional-sample interpolation.
 *
 * Indexed as g_kuiABCD[dy][dx] = {A, B, C, D}, with dx and dy in 0..7:
 *   A = (8 - dx) * (8 - dy)
 *   B = dx       * (8 - dy)
 *   C = (8 - dx) * dy
 *   D = dx       * dy
 * The four weights of every entry add up to 64.
 */
static const uint8_t g_kuiABCD[8][8][4] = {
  { /* dy = 0 */
    {64,  0,  0,  0}, {56,  8,  0,  0}, {48, 16,  0,  0}, {40, 24,  0,  0},
    {32, 32,  0,  0}, {24, 40,  0,  0}, {16, 48,  0,  0}, { 8, 56,  0,  0}
  },
  { /* dy = 1 */
    {56,  0,  8,  0}, {49,  7,  7,  1}, {42, 14,  6,  2}, {35, 21,  5,  3},
    {28, 28,  4,  4}, {21, 35,  3,  5}, {14, 42,  2,  6}, { 7, 49,  1,  7}
  },
  { /* dy = 2 */
    {48,  0, 16,  0}, {42,  6, 14,  2}, {36, 12, 12,  4}, {30, 18, 10,  6},
    {24, 24,  8,  8}, {18, 30,  6, 10}, {12, 36,  4, 12}, { 6, 42,  2, 14}
  },
  { /* dy = 3 */
    {40,  0, 24,  0}, {35,  5, 21,  3}, {30, 10, 18,  6}, {25, 15, 15,  9},
    {20, 20, 12, 12}, {15, 25,  9, 15}, {10, 30,  6, 18}, { 5, 35,  3, 21}
  },
  { /* dy = 4 */
    {32,  0, 32,  0}, {28,  4, 28,  4}, {24,  8, 24,  8}, {20, 12, 20, 12},
    {16, 16, 16, 16}, {12, 20, 12, 20}, { 8, 24,  8, 24}, { 4, 28,  4, 28}
  },
  { /* dy = 5 */
    {24,  0, 40,  0}, {21,  3, 35,  5}, {18,  6, 30, 10}, {15,  9, 25, 15},
    {12, 12, 20, 20}, { 9, 15, 15, 25}, { 6, 18, 10, 30}, { 3, 21,  5, 35}
  },
  { /* dy = 6 */
    {16,  0, 48,  0}, {14,  2, 42,  6}, {12,  4, 36, 12}, {10,  6, 30, 18},
    { 8,  8, 24, 24}, { 6, 10, 18, 30}, { 4, 12, 12, 36}, { 2, 14,  6, 42}
  },
  { /* dy = 7 */
    { 8,  0, 56,  0}, { 7,  1, 49,  7}, { 6,  2, 42, 14}, { 5,  3, 35, 21},
    { 4,  4, 28, 28}, { 3,  5, 21, 35}, { 2,  6, 14, 42}, { 1,  7,  7, 49}
  }
};
96
97 //***************************************************************************//
98 // C code implementation //
99 //***************************************************************************//
McCopyWidthEq2_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)100 static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
101 int32_t iHeight) {
102 int32_t i;
103 for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
104 ST16A2 (pDst, LD16 (pSrc));
105 pDst += iDstStride;
106 pSrc += iSrcStride;
107 }
108 }
109
McCopyWidthEq4_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)110 static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
111 int32_t iHeight) {
112 int32_t i;
113 for (i = 0; i < iHeight; i++) {
114 ST32A4 (pDst, LD32 (pSrc));
115 pDst += iDstStride;
116 pSrc += iSrcStride;
117 }
118 }
119
McCopyWidthEq8_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)120 static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
121 int32_t iHeight) {
122 int32_t i;
123 for (i = 0; i < iHeight; i++) {
124 ST64A8 (pDst, LD64 (pSrc));
125 pDst += iDstStride;
126 pSrc += iSrcStride;
127 }
128 }
129
McCopyWidthEq16_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)130 static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
131 int32_t iHeight) {
132 int32_t i;
133 for (i = 0; i < iHeight; i++) {
134 ST64A8 (pDst , LD64 (pSrc));
135 ST64A8 (pDst + 8, LD64 (pSrc + 8));
136 pDst += iDstStride;
137 pSrc += iSrcStride;
138 }
139 }
140
141 //--------------------Luma sample MC------------------//
142
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to pSrc[0..5] and returns the unscaled sum.
// No rounding or shifting is done here; the caller normalises the result.
static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) {
  const int32_t kiOuter  = pSrc[0] + pSrc[5];  // taps weighted  +1
  const int32_t kiMiddle = pSrc[1] + pSrc[4];  // taps weighted  -5
  const int32_t kiInner  = pSrc[2] + pSrc[3];  // taps weighted +20

  return kiOuter - 5 * kiMiddle + 20 * kiInner;
}
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to six 8-bit samples spaced kiOffset apart,
// located at pSrc - 2*kiOffset .. pSrc + 3*kiOffset, and returns the unscaled sum.
// Horizontal filtering passes kiOffset = 1, vertical filtering passes kiOffset = iSrcStride.
static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
  const int32_t kiTwoSteps   = kiOffset + kiOffset;
  const int32_t kiThreeSteps = kiTwoSteps + kiOffset;

  const uint32_t kuiOuter  = pSrc[-kiTwoSteps] + pSrc[kiThreeSteps];  // taps weighted  +1
  const uint32_t kuiMiddle = pSrc[-kiOffset] + pSrc[kiTwoSteps];      // taps weighted  -5
  const uint32_t kuiInner  = pSrc[0] + pSrc[kiOffset];                // taps weighted +20

  // Evaluated in unsigned arithmetic; the conversion to int32_t on return yields the signed sum.
  return (kuiOuter - kuiMiddle * 5 + kuiInner * 20);
}
161
// Writes the rounded-up average (a + b + 1) >> 1 of two iWidth x iHeight blocks to pDst.
// Each of the three buffers advances by its own stride after every row.
static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
      const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol] + 1;
      pDst[iCol] = (uint8_t) (kiSum >> 1);
    }
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    pDst  += iDstStride;
  }
}
McCopy_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)174 static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
175 int32_t iHeight) {
176 if (iWidth == 16)
177 McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
178 else if (iWidth == 8)
179 McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
180 else if (iWidth == 4)
181 McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
182 else //here iWidth == 2
183 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
184 }
185
186 //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
McHorVer20_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)187 static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
188 int32_t iWidth,
189 int32_t iHeight) {
190 int32_t i, j;
191 for (i = 0; i < iHeight; i++) {
192 for (j = 0; j < iWidth; j++) {
193 pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5);
194 }
195 pDst += iDstStride;
196 pSrc += iSrcStride;
197 }
198 }
199
200 //vertical filter to gain half sample, that is (0, 2) location in quarter sample
McHorVer02_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)201 static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
202 int32_t iWidth,
203 int32_t iHeight) {
204 int32_t i, j;
205 for (i = 0; i < iHeight; i++) {
206 for (j = 0; j < iWidth; j++) {
207 pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5);
208 }
209 pDst += iDstStride;
210 pSrc += iSrcStride;
211 }
212 }
213
214 //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
McHorVer22_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)215 static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
216 int32_t iWidth,
217 int32_t iHeight) {
218 int16_t iTmp[17 + 5];
219 int32_t i, j, k;
220
221 for (i = 0; i < iHeight; i++) {
222 for (j = 0; j < iWidth + 5; j++) {
223 iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
224 }
225 for (k = 0; k < iWidth; k++) {
226 pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
227 }
228 pSrc += iSrcStride;
229 pDst += iDstStride;
230 }
231 }
232
233 /////////////////////luma MC//////////////////////////
McHorVer01_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)234 static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
235 int32_t iWidth,
236 int32_t iHeight) {
237 uint8_t uiTmp[256];
238 McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
239 PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
240 }
McHorVer03_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)241 static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
242 int32_t iWidth,
243 int32_t iHeight) {
244 uint8_t uiTmp[256];
245 McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
246 PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
247 }
McHorVer10_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)248 static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
249 int32_t iWidth,
250 int32_t iHeight) {
251 uint8_t uiTmp[256];
252 McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
253 PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
254 }
McHorVer11_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)255 static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
256 int32_t iWidth,
257 int32_t iHeight) {
258 uint8_t uiHorTmp[256];
259 uint8_t uiVerTmp[256];
260 McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
261 McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
262 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
263 }
McHorVer12_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)264 static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
265 int32_t iWidth,
266 int32_t iHeight) {
267 uint8_t uiVerTmp[256];
268 uint8_t uiCtrTmp[256];
269 McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
270 McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
271 PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
272 }
McHorVer13_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)273 static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
274 int32_t iWidth,
275 int32_t iHeight) {
276 uint8_t uiHorTmp[256];
277 uint8_t uiVerTmp[256];
278 McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
279 McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
280 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
281 }
McHorVer21_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)282 static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
283 int32_t iWidth,
284 int32_t iHeight) {
285 uint8_t uiHorTmp[256];
286 uint8_t uiCtrTmp[256];
287 McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
288 McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
289 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
290 }
McHorVer23_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)291 static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
292 int32_t iWidth,
293 int32_t iHeight) {
294 uint8_t uiHorTmp[256];
295 uint8_t uiCtrTmp[256];
296 McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
297 McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
298 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
299 }
McHorVer30_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)300 static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
301 int32_t iWidth,
302 int32_t iHeight) {
303 uint8_t uiHorTmp[256];
304 McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
305 PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
306 }
McHorVer31_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)307 static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
308 int32_t iWidth,
309 int32_t iHeight) {
310 uint8_t uiHorTmp[256];
311 uint8_t uiVerTmp[256];
312 McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
313 McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
314 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
315 }
McHorVer32_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)316 static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
317 int32_t iWidth,
318 int32_t iHeight) {
319 uint8_t uiVerTmp[256];
320 uint8_t uiCtrTmp[256];
321 McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
322 McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
323 PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
324 }
McHorVer33_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)325 static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
326 int32_t iWidth,
327 int32_t iHeight) {
328 uint8_t uiHorTmp[256];
329 uint8_t uiVerTmp[256];
330 McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
331 McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
332 PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
333 }
334
McLuma_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)335 void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
336 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
337 //pSrc has been added the offset of mv
338 {
339 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
340 {McCopy_c, McHorVer01_c, McHorVer02_c, McHorVer03_c},
341 {McHorVer10_c, McHorVer11_c, McHorVer12_c, McHorVer13_c},
342 {McHorVer20_c, McHorVer21_c, McHorVer22_c, McHorVer23_c},
343 {McHorVer30_c, McHorVer31_c, McHorVer32_c, McHorVer33_c},
344 };
345
346 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
347 }
348
McChromaWithFragMv_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)349 static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
350 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
351 int32_t i, j;
352 int32_t iA, iB, iC, iD;
353 const uint8_t* pSrcNext = pSrc + iSrcStride;
354 const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
355 iA = pABCD[0];
356 iB = pABCD[1];
357 iC = pABCD[2];
358 iD = pABCD[3];
359 for (i = 0; i < iHeight; i++) {
360 for (j = 0; j < iWidth; j++) {
361 pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6;
362 }
363 pDst += iDstStride;
364 pSrc = pSrcNext;
365 pSrcNext += iSrcStride;
366 }
367 }
368
McChroma_c(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)369 void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
370 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
371 //pSrc has been added the offset of mv
372 {
373 const int32_t kiD8x = iMvX & 0x07;
374 const int32_t kiD8y = iMvY & 0x07;
375 if (0 == kiD8x && 0 == kiD8y)
376 McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
377 else
378 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
379 }
380
381 #if defined(X86_ASM)
382 //***************************************************************************//
383 // SSE2 implement //
384 //***************************************************************************//
McHorVer22WidthEq8_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)385 static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
386 int32_t iHeight) {
387 ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
388 McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
389 McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
390 }
391
McHorVer02WidthEq16_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)392 static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
393 int32_t iHeight) {
394 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
395 McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
396 }
397
McHorVer22WidthEq16_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)398 static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
399 int32_t iHeight) {
400 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
401 McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
402 }
403
McHorVer20Width5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)404 void McHorVer20Width5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
405 int32_t iWidth, int32_t iHeight) {
406 if (iWidth == 17 || iWidth == 9)
407 McHorVer20Width9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
408 else //if (iWidth == 5)
409 McHorVer20Width5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
410 }
411
McHorVer02Height5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)412 void McHorVer02Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
413 int32_t iWidth, int32_t iHeight) {
414 if (iWidth == 16 || iWidth == 8)
415 McHorVer02Height9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
416 else //if (iWidth == 4)
417 McHorVer02Height5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
418 }
419
McHorVer22Width5Or9Or17Height5Or9Or17_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)420 void McHorVer22Width5Or9Or17Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
421 int32_t iWidth, int32_t iHeight) {
422 ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
423 if (iWidth == 17 || iWidth == 9){
424 int32_t tmp1 = 2 * (iWidth - 8);
425 McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
426 McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
427 McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
428 }
429 else{ //if(iWidth == 5)
430 int32_t tmp1 = 2 * (iWidth - 4);
431 McHorVer22Width5HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
432 McHorVer22Width4VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
433 McHorVer22Width4VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 4, iDstStride, 4, iHeight);
434 }
435
436 }
437
McCopy_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)438 static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
439 int32_t iWidth,
440 int32_t iHeight) {
441 if (iWidth == 16)
442 McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
443 else if (iWidth == 8)
444 McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
445 else if (iWidth == 4)
446 McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
447 else
448 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
449 }
450
McHorVer20_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)451 static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
452 int32_t iWidth, int32_t iHeight) {
453 if (iWidth == 16)
454 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
455 else if (iWidth == 8)
456 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
457 else
458 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
459 }
460
McHorVer02_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)461 static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
462 int32_t iWidth, int32_t iHeight) {
463 if (iWidth == 16)
464 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
465 else if (iWidth == 8)
466 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
467 else
468 McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
469 }
470
McHorVer22_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)471 static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
472 int32_t iWidth, int32_t iHeight) {
473 if (iWidth == 16)
474 McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
475 else if (iWidth == 8)
476 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
477 else
478 McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
479 }
480
McHorVer01_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)481 static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
482 int32_t iWidth, int32_t iHeight) {
483 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
484 if (iWidth == 16) {
485 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
486 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
487 } else if (iWidth == 8) {
488 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
489 PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
490 } else {
491 McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
492 PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
493 }
494 }
McHorVer03_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)495 static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
496 int32_t iWidth, int32_t iHeight) {
497 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
498 if (iWidth == 16) {
499 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
500 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
501 } else if (iWidth == 8) {
502 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
503 PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
504 } else {
505 McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
506 PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
507 }
508 }
McHorVer10_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)509 static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
510 int32_t iWidth, int32_t iHeight) {
511 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
512 if (iWidth == 16) {
513 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
514 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
515 } else if (iWidth == 8) {
516 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
517 PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
518 } else {
519 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
520 PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
521 }
522 }
McHorVer11_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)523 static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
524 int32_t iWidth, int32_t iHeight) {
525 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
526 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
527 if (iWidth == 16) {
528 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
529 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
530 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
531 } else if (iWidth == 8) {
532 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
533 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
534 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
535 } else {
536 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
537 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
538 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
539 }
540 }
McHorVer12_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)541 static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
542 int32_t iWidth, int32_t iHeight) {
543 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
544 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
545 if (iWidth == 16) {
546 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
547 McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
548 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
549 } else if (iWidth == 8) {
550 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
551 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
552 PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
553 } else {
554 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
555 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
556 PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
557 }
558 }
McHorVer13_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)559 static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
560 int32_t iWidth, int32_t iHeight) {
561 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
562 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
563 if (iWidth == 16) {
564 McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
565 McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
566 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
567 } else if (iWidth == 8) {
568 McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
569 McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
570 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
571 } else {
572 McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
573 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4 , iHeight);
574 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
575 }
576 }
McHorVer21_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)577 static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
578 int32_t iWidth, int32_t iHeight) {
579 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
580 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
581 if (iWidth == 16) {
582 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
583 McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
584 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
585 } else if (iWidth == 8) {
586 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
587 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
588 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
589 } else {
590 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
591 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
592 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
593 }
594 }
McHorVer23_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)595 static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
596 int32_t iWidth, int32_t iHeight) {
597 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
598 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
599 if (iWidth == 16) {
600 McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
601 McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
602 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
603 } else if (iWidth == 8) {
604 McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
605 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
606 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
607 } else {
608 McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
609 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
610 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
611 }
612 }
McHorVer30_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)613 static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
614 int32_t iWidth, int32_t iHeight) {
615 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
616 if (iWidth == 16) {
617 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
618 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
619 } else if (iWidth == 8) {
620 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
621 PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
622 } else {
623 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
624 PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
625 }
626 }
McHorVer31_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)627 static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
628 int32_t iWidth, int32_t iHeight) {
629 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
630 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
631 if (iWidth == 16) {
632 McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
633 McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
634 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
635 } else if (iWidth == 8) {
636 McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
637 McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
638 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
639 } else {
640 McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
641 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
642 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
643 }
644 }
McHorVer32_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)645 static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
646 int32_t iWidth, int32_t iHeight) {
647 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
648 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
649 if (iWidth == 16) {
650 McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
651 McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
652 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
653 } else if (iWidth == 8) {
654 McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
655 McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
656 PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
657 } else {
658 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
659 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
660 PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
661 }
662 }
McHorVer33_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)663 static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
664 int32_t iWidth, int32_t iHeight) {
665 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
666 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
667 if (iWidth == 16) {
668 McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
669 McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
670 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
671 } else if (iWidth == 8) {
672 McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
673 McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
674 PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
675 } else {
676 McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
677 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
678 PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
679 }
680 }
681
McLuma_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)682 void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
683 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
684 //pSrc has been added the offset of mv
685 {
686 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
687 {McCopy_sse2, McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
688 {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
689 {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
690 {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
691 };
692
693 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
694 }
695
McChroma_sse2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)696 void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
697 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
698 static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
699 McChromaWidthEq4_mmx,
700 McChromaWidthEq8_sse2
701 };
702 const int32_t kiD8x = iMvX & 0x07;
703 const int32_t kiD8y = iMvY & 0x07;
704 if (kiD8x == 0 && kiD8y == 0) {
705 McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
706 return;
707 }
708 if (iWidth != 2) {
709 kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
710 } else
711 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
712 }
713
714 //***************************************************************************//
715 // SSSE3 implementation //
716 //***************************************************************************//
717
PixelAvgWidth4Or8Or16_sse2(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)718 void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
719 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
720 if (iWidth < 8) {
721 PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
722 } else if (iWidth == 8) {
723 PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
724 } else {
725 PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
726 }
727 }
728
McCopy_sse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)729 void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
730 int32_t iWidth, int32_t iHeight) {
731 switch (iWidth) {
732 case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
733 case 8: return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
734 case 4: return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
735 }
736 return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
737 }
738
McHorVer22_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)739 void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
740 int32_t iWidth, int32_t iHeight) {
741 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16);
742 if (iWidth < 8) {
743 McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
744 McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);
745 } else if (iWidth == 8) {
746 McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
747 McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
748 } else {
749 McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
750 McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);
751 McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
752 McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight);
753 }
754 }
755
McHorVer01_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)756 void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
757 int32_t iWidth, int32_t iHeight) {
758 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
759 McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
760 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
761 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
762 }
763
McHorVer03_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)764 void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
765 int32_t iWidth, int32_t iHeight) {
766 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
767 McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
768 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
769 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
770 }
771
McHorVer10_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)772 void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
773 int32_t iWidth, int32_t iHeight) {
774 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
775 McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
776 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
777 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
778 }
779
McHorVer11_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)780 void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
781 int32_t iWidth, int32_t iHeight) {
782 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
783 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
784 McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
785 McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
786 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
787 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
788 }
789
McHorVer12_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)790 void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
791 int32_t iWidth, int32_t iHeight) {
792 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
793 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
794 McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
795 McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
796 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
797 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
798 }
799
McHorVer13_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)800 void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
801 int32_t iWidth, int32_t iHeight) {
802 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
803 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
804 McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
805 McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
806 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
807 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
808 }
809
McHorVer21_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)810 void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
811 int32_t iWidth, int32_t iHeight) {
812 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
813 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
814 McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
815 McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
816 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
817 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
818 }
819
McHorVer23_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)820 void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
821 int32_t iWidth, int32_t iHeight) {
822 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
823 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
824 McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
825 McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
826 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
827 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
828 }
829
McHorVer30_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)830 void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
831 int32_t iWidth, int32_t iHeight) {
832 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
833 McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
834 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
835 }
836
McHorVer31_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)837 void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
838 int32_t iWidth, int32_t iHeight) {
839 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
840 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
841 McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
842 McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
843 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
844 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
845 }
846
McHorVer32_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)847 void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
848 int32_t iWidth, int32_t iHeight) {
849 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
850 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
851 McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
852 McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
853 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
854 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
855 }
856
McHorVer33_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)857 void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
858 int32_t iWidth, int32_t iHeight) {
859 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
860 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
861 McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
862 McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
863 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
864 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
865 }
866
McHorVer22Width5Or9Or17_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)867 void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
868 int32_t iWidth, int32_t iHeight) {
869 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16)
870 if (iWidth > 5) {
871 McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5);
872 McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
873 } else {
874 McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
875 McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);
876 }
877 }
878
McLuma_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)879 void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
880 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
881 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
882 {McCopy_sse3, McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},
883 {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},
884 {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},
885 {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},
886 };
887
888 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
889 }
890
McChroma_ssse3(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)891 void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
892 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
893 static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
894 McChromaWidthEq4_mmx,
895 McChromaWidthEq8_ssse3
896 };
897 const int32_t kiD8x = iMvX & 0x07;
898 const int32_t kiD8y = iMvY & 0x07;
899 if (kiD8x == 0 && kiD8y == 0) {
900 McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
901 return;
902 }
903 if (iWidth != 2) {
904 kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
905 } else
906 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
907 }
908
909 //***************************************************************************//
910 // AVX2 implementation //
911 //***************************************************************************//
912
913 #ifdef HAVE_AVX2
914
McHorVer22_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)915 void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
916 int32_t iWidth, int32_t iHeight) {
917 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 16, 32);
918 if (iWidth < 8) {
919 McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
920 McHorVer02Width4S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
921 } else if (iWidth == 8) {
922 McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
923 McHorVer02Width8S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
924 } else {
925 McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
926 McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
927 }
928 }
929
McHorVer01_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)930 void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
931 int32_t iWidth, int32_t iHeight) {
932 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
933 McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
934 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
935 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
936 }
937
McHorVer03_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)938 void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
939 int32_t iWidth, int32_t iHeight) {
940 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
941 McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
942 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride,
943 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
944 }
945
McHorVer10_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)946 void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
947 int32_t iWidth, int32_t iHeight) {
948 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
949 McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
950 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
951 &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
952 }
953
McHorVer11_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)954 void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
955 int32_t iWidth, int32_t iHeight) {
956 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
957 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
958 McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
959 McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
960 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
961 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
962 }
963
McHorVer12_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)964 void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
965 int32_t iWidth, int32_t iHeight) {
966 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
967 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
968 McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
969 McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
970 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
971 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
972 }
973
McHorVer13_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)974 void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
975 int32_t iWidth, int32_t iHeight) {
976 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
977 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
978 McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
979 McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
980 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
981 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
982 }
983
McHorVer21_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)984 void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
985 int32_t iWidth, int32_t iHeight) {
986 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
987 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
988 McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
989 McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
990 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
991 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
992 }
993
McHorVer23_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)994 void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
995 int32_t iWidth, int32_t iHeight) {
996 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
997 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
998 McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
999 McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1000 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1001 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1002 }
1003
McHorVer30_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1004 void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1005 int32_t iWidth, int32_t iHeight) {
1006 ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
1007 McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1008 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
1009 }
1010
McHorVer31_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1011 void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1012 int32_t iWidth, int32_t iHeight) {
1013 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1014 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1015 McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1016 McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1017 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1018 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1019 }
1020
McHorVer32_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1021 void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1022 int32_t iWidth, int32_t iHeight) {
1023 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1024 ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
1025 McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1026 McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1027 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
1028 &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
1029 }
1030
McHorVer33_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1031 void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1032 int32_t iWidth, int32_t iHeight) {
1033 ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
1034 ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
1035 McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
1036 McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1037 PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
1038 &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
1039 }
1040
McHorVer22Width5Or9Or17_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1041 void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1042 int32_t iWidth, int32_t iHeight) {
1043 if (iWidth < 9) {
1044 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16)
1045 McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1046 McHorVer02Width5S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1047 } else if (iWidth == 9) {
1048 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, 16, 32)
1049 McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1050 McHorVer02Width9S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
1051 } else {
1052 ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32)
1053 McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
1054 McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
1055 }
1056 }
1057
McLuma_avx2(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1058 void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1059 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1060 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
1061 {McCopy_sse3, McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2},
1062 {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2},
1063 {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2},
1064 {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2},
1065 };
1066
1067 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1068 }
1069
1070 #endif //HAVE_AVX2
1071
PixelAvg_sse2(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1072 void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1073 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1074 static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1075 PixelAvgWidthEq8_mmx,
1076 PixelAvgWidthEq16_sse2
1077 };
1078 kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1079 }
1080
1081 #endif //X86_ASM
1082 //***************************************************************************//
1083 // NEON implementation //
1084 //***************************************************************************//
1085 #if defined(HAVE_NEON)
McHorVer20Width5Or9Or17_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1086 void McHorVer20Width5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1087 int32_t iWidth, int32_t iHeight) {
1088 if (iWidth == 17)
1089 McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1090 else if (iWidth == 9)
1091 McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1092 else //if (iWidth == 5)
1093 McHorVer20Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1094 }
McHorVer02Height5Or9Or17_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1095 void McHorVer02Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1096 int32_t iWidth, int32_t iHeight) {
1097 if (iWidth == 16)
1098 McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1099 else if (iWidth == 8)
1100 McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1101 else //if (iWidth == 4)
1102 McHorVer02Height5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1103 }
McHorVer22Width5Or9Or17Height5Or9Or17_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1104 void McHorVer22Width5Or9Or17Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1105 int32_t iWidth, int32_t iHeight) {
1106 if (iWidth == 17)
1107 McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1108 else if (iWidth == 9)
1109 McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1110 else //if (iWidth == 5)
1111 McHorVer22Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1112 }
McCopy_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1113 void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1114 int32_t iWidth, int32_t iHeight) {
1115 if (16 == iWidth)
1116 McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1117 else if (8 == iWidth)
1118 McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1119 else if (4 == iWidth)
1120 McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1121 else
1122 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1123 }
McHorVer20_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1124 void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1125 int32_t iWidth, int32_t iHeight) {
1126 if (iWidth == 16)
1127 McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1128 else if (iWidth == 8)
1129 McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1130 else if (iWidth == 4)
1131 McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1132 }
McHorVer02_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1133 void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1134 int32_t iWidth, int32_t iHeight) {
1135 if (iWidth == 16)
1136 McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1137 else if (iWidth == 8)
1138 McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1139 else if (iWidth == 4)
1140 McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1141 }
McHorVer22_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1142 void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1143 int32_t iWidth, int32_t iHeight) {
1144 if (iWidth == 16)
1145 McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1146 else if (iWidth == 8)
1147 McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1148 else if (iWidth == 4)
1149 McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1150 }
1151
McHorVer01_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1152 void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1153 int32_t iWidth, int32_t iHeight) {
1154 if (iWidth == 16)
1155 McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1156 else if (iWidth == 8)
1157 McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1158 else if (iWidth == 4)
1159 McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1160 }
McHorVer03_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1161 void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1162 int32_t iWidth, int32_t iHeight) {
1163 if (iWidth == 16)
1164 McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1165 else if (iWidth == 8)
1166 McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1167 else if (iWidth == 4)
1168 McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1169 }
McHorVer10_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1170 void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1171 int32_t iWidth, int32_t iHeight) {
1172 if (iWidth == 16)
1173 McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1174 else if (iWidth == 8)
1175 McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1176 else if (iWidth == 4)
1177 McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1178 }
McHorVer11_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1179 void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1180 int32_t iWidth, int32_t iHeight) {
1181 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1182 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1183 if (iWidth == 16) {
1184 McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1185 McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1186 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1187 } else if (iWidth == 8) {
1188 McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1189 McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1190 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1191 } else if (iWidth == 4) {
1192 McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1193 McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1194 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1195 }
1196 }
McHorVer12_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1197 void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1198 int32_t iWidth, int32_t iHeight) {
1199 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1200 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1201 if (iWidth == 16) {
1202 McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1203 McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1204 PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1205 } else if (iWidth == 8) {
1206 McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1207 McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1208 PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1209 } else if (iWidth == 4) {
1210 McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1211 McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1212 PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1213 }
1214 }
McHorVer13_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1215 void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1216 int32_t iWidth, int32_t iHeight) {
1217 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1218 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1219 if (iWidth == 16) {
1220 McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1221 McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1222 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1223 } else if (iWidth == 8) {
1224 McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1225 McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1226 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1227 } else if (iWidth == 4) {
1228 McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1229 McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1230 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1231 }
1232 }
McHorVer21_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1233 void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1234 int32_t iWidth, int32_t iHeight) {
1235 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1236 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1237 if (iWidth == 16) {
1238 McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1239 McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1240 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1241 } else if (iWidth == 8) {
1242 McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1243 McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1244 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1245 } else if (iWidth == 4) {
1246 McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1247 McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1248 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1249 }
1250 }
McHorVer23_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1251 void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1252 int32_t iWidth, int32_t iHeight) {
1253 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1254 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1255 if (iWidth == 16) {
1256 McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1257 McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1258 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1259 } else if (iWidth == 8) {
1260 McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1261 McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1262 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1263 } else if (iWidth == 4) {
1264 McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1265 McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1266 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
1267 }
1268 }
McHorVer30_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1269 void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1270 int32_t iWidth, int32_t iHeight) {
1271 if (iWidth == 16)
1272 McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1273 else if (iWidth == 8)
1274 McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1275 else if (iWidth == 4)
1276 McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1277 }
McHorVer31_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1278 void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1279 int32_t iWidth, int32_t iHeight) {
1280 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1281 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1282 if (iWidth == 16) {
1283 McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1284 McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1285 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1286 } else if (iWidth == 8) {
1287 McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1288 McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1289 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1290 } else if (iWidth == 4) {
1291 McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1292 McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1293 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1294 }
1295 }
McHorVer32_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1296 void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1297 int32_t iWidth, int32_t iHeight) {
1298 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1299 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1300 if (iWidth == 16) {
1301 McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1302 McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1303 PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1304 } else if (iWidth == 8) {
1305 McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1306 McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1307 PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1308 } else if (iWidth == 4) {
1309 McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1310 McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1311 PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
1312 }
1313 }
McHorVer33_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1314 void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1315 int32_t iWidth, int32_t iHeight) {
1316 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1317 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1318 if (iWidth == 16) {
1319 McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1320 McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1321 PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1322 } else if (iWidth == 8) {
1323 McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1324 McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1325 PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1326 } else if (iWidth == 4) {
1327 McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1328 McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1329 PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
1330 }
1331 }
1332
McLuma_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1333 void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1334 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1335 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1336 {McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon},
1337 {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
1338 {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon},
1339 {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
1340 };
1341 // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1342 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1343 }
McChroma_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1344 void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1345 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1346 if (0 == iMvX && 0 == iMvY) {
1347 if (8 == iWidth)
1348 McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1349 else if (iWidth == 4)
1350 McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1351 else //here iWidth == 2
1352 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1353 } else {
1354 const int32_t kiD8x = iMvX & 0x07;
1355 const int32_t kiD8y = iMvY & 0x07;
1356 if (8 == iWidth)
1357 McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1358 else if (4 == iWidth)
1359 McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1360 else //here iWidth == 2
1361 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1362 }
1363 }
PixelAvg_neon(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1364 void PixelAvg_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1365 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1366 static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1367 PixStrideAvgWidthEq8_neon,
1368 PixStrideAvgWidthEq16_neon
1369 };
1370 kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1371 }
1372 #endif
1373 #if defined(HAVE_NEON_AARCH64)
McHorVer20Width5Or9Or17_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1374 void McHorVer20Width5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1375 int32_t iWidth, int32_t iHeight) {
1376 if (iWidth == 17)
1377 McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1378 else if (iWidth == 9)
1379 McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1380 else //if (iWidth == 5)
1381 McHorVer20Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1382 }
McHorVer02Height5Or9Or17_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1383 void McHorVer02Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1384 int32_t iWidth, int32_t iHeight) {
1385 if (iWidth == 16)
1386 McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1387 else if (iWidth == 8)
1388 McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1389 else //if (iWidth == 4)
1390 McHorVer02Height5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1391 }
McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1392 void McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
1393 int32_t iDstStride,
1394 int32_t iWidth, int32_t iHeight) {
1395 if (iWidth == 17)
1396 McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1397 else if (iWidth == 9)
1398 McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1399 else //if (iWidth == 5)
1400 McHorVer22Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1401 }
McCopy_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1402 void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1403 int32_t iWidth, int32_t iHeight) {
1404 if (16 == iWidth)
1405 McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1406 else if (8 == iWidth)
1407 McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1408 else if (4 == iWidth)
1409 McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1410 else
1411 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1412 }
McHorVer20_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1413 void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1414 int32_t iWidth, int32_t iHeight) {
1415 if (iWidth == 16)
1416 McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1417 else if (iWidth == 8)
1418 McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1419 else if (iWidth == 4)
1420 McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1421 }
McHorVer02_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1422 void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1423 int32_t iWidth, int32_t iHeight) {
1424 if (iWidth == 16)
1425 McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1426 else if (iWidth == 8)
1427 McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1428 else if (iWidth == 4)
1429 McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1430 }
McHorVer22_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1431 void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1432 int32_t iWidth, int32_t iHeight) {
1433 if (iWidth == 16)
1434 McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1435 else if (iWidth == 8)
1436 McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1437 else if (iWidth == 4)
1438 McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1439 }
1440
McHorVer01_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1441 void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1442 int32_t iWidth, int32_t iHeight) {
1443 if (iWidth == 16)
1444 McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1445 else if (iWidth == 8)
1446 McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1447 else if (iWidth == 4)
1448 McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1449 }
McHorVer03_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1450 void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1451 int32_t iWidth, int32_t iHeight) {
1452 if (iWidth == 16)
1453 McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1454 else if (iWidth == 8)
1455 McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1456 else if (iWidth == 4)
1457 McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1458 }
McHorVer10_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1459 void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1460 int32_t iWidth, int32_t iHeight) {
1461 if (iWidth == 16)
1462 McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1463 else if (iWidth == 8)
1464 McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1465 else if (iWidth == 4)
1466 McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1467 }
McHorVer11_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1468 void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1469 int32_t iWidth, int32_t iHeight) {
1470 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1471 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1472 if (iWidth == 16) {
1473 McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1474 McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1475 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1476 } else if (iWidth == 8) {
1477 McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1478 McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1479 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1480 } else if (iWidth == 4) {
1481 McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1482 McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1483 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1484 }
1485 }
McHorVer12_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1486 void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1487 int32_t iWidth, int32_t iHeight) {
1488 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1489 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1490 if (iWidth == 16) {
1491 McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1492 McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1493 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1494 } else if (iWidth == 8) {
1495 McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1496 McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1497 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1498 } else if (iWidth == 4) {
1499 McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1500 McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1501 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1502 }
1503 }
McHorVer13_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1504 void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1505 int32_t iWidth, int32_t iHeight) {
1506 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1507 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1508 if (iWidth == 16) {
1509 McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1510 McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1511 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1512 } else if (iWidth == 8) {
1513 McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1514 McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1515 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1516 } else if (iWidth == 4) {
1517 McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1518 McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
1519 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1520 }
1521 }
McHorVer21_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1522 void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1523 int32_t iWidth, int32_t iHeight) {
1524 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1525 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1526 if (iWidth == 16) {
1527 McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1528 McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1529 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1530 } else if (iWidth == 8) {
1531 McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1532 McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1533 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1534 } else if (iWidth == 4) {
1535 McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1536 McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1537 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1538 }
1539 }
McHorVer23_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1540 void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1541 int32_t iWidth, int32_t iHeight) {
1542 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1543 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1544 if (iWidth == 16) {
1545 McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1546 McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1547 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1548 } else if (iWidth == 8) {
1549 McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1550 McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1551 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1552 } else if (iWidth == 4) {
1553 McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1554 McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1555 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
1556 }
1557 }
McHorVer30_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1558 void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1559 int32_t iWidth, int32_t iHeight) {
1560 if (iWidth == 16)
1561 McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1562 else if (iWidth == 8)
1563 McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1564 else if (iWidth == 4)
1565 McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1566 }
McHorVer31_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1567 void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1568 int32_t iWidth, int32_t iHeight) {
1569 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1570 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1571 if (iWidth == 16) {
1572 McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1573 McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1574 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1575 } else if (iWidth == 8) {
1576 McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1577 McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1578 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1579 } else if (iWidth == 4) {
1580 McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
1581 McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1582 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1583 }
1584 }
McHorVer32_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1585 void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1586 int32_t iWidth, int32_t iHeight) {
1587 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1588 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
1589 if (iWidth == 16) {
1590 McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1591 McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1592 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1593 } else if (iWidth == 8) {
1594 McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1595 McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1596 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1597 } else if (iWidth == 4) {
1598 McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1599 McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
1600 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
1601 }
1602 }
McHorVer33_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1603 void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1604 int32_t iWidth, int32_t iHeight) {
1605 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
1606 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
1607 if (iWidth == 16) {
1608 McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1609 McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1610 PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1611 } else if (iWidth == 8) {
1612 McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1613 McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1614 PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1615 } else if (iWidth == 4) {
1616 McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
1617 McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
1618 PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
1619 }
1620 }
1621
McLuma_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1622 void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1623 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1624 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
1625 {McCopy_AArch64_neon, McHorVer01_AArch64_neon, McHorVer02_AArch64_neon, McHorVer03_AArch64_neon},
1626 {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
1627 {McHorVer20_AArch64_neon, McHorVer21_AArch64_neon, McHorVer22_AArch64_neon, McHorVer23_AArch64_neon},
1628 {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
1629 };
1630 // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
1631 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
1632 }
McChroma_AArch64_neon(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)1633 void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1634 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
1635 if (0 == iMvX && 0 == iMvY) {
1636 if (8 == iWidth)
1637 McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1638 else if (iWidth == 4)
1639 McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1640 else //here iWidth == 2
1641 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
1642 } else {
1643 const int32_t kiD8x = iMvX & 0x07;
1644 const int32_t kiD8y = iMvY & 0x07;
1645 if (8 == iWidth)
1646 McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1647 else if (4 == iWidth)
1648 McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
1649 else //here iWidth == 2
1650 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
1651 }
1652 }
PixelAvg_AArch64_neon(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)1653 void PixelAvg_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1654 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
1655 static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
1656 PixStrideAvgWidthEq8_AArch64_neon,
1657 PixStrideAvgWidthEq16_AArch64_neon
1658 };
1659 kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
1660 }
1661 #endif
1662
1663 #if defined(HAVE_MMI)
/*
 * Inline-asm fragment: load 8 bytes from address register r0 (load-left /
 * load-right pair, so r0 need not be aligned) and interleave them with f4.
 *   f0 <- low  four bytes of the load interleaved with f4
 *   f2 <- high four bytes of the load interleaved with f4
 * The call sites pass a zeroed register as f4, which widens each byte to a
 * 16-bit lane.
 */
#define MMI_LOAD_8P(f0, f2, f4, r0) \
  "gsldlc1 "#f0", 0x7("#r0") \n\t" \
  "gsldrc1 "#f0", 0x0("#r0") \n\t" \
  "punpckhbh "#f2", "#f0", "#f4" \n\t" \
  "punpcklbh "#f0", "#f0", "#f4" \n\t"
1669
/*
 * Inline-asm fragment: 6-tap filter over six 16-bit input vectors, 4-byte store.
 *
 * Inputs are six register pairs (low/high halves): A = f0/f2, B = f4/f6,
 * C = f8/f10, D = f12/f14, E = f16/f18, F = f20/f22. Per 16-bit lane it computes
 *   (A + F - 5 * (B + E) + 20 * (C + D) + 16) >> 5
 * packs the lanes to bytes with unsigned saturation and stores the low four
 * bytes to address r0 (store-left / store-right pair).
 *
 * Register usage:
 *   f0/f2                : overwritten with the result
 *   f24/f26, f28/f30     : scratch; f28 is zero on exit
 *   f8                   : used for the constants 16, 2 and 5; its original
 *                          value is parked in GPR r2 and restored at the end
 *   r1                   : scratch GPR for loading constants
 */
#define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
  "paddh "#f0", "#f0", "#f20" \n\t" \
  "paddh "#f2", "#f2", "#f22" \n\t" \
  "mov.d "#f28", "#f8" \n\t" \
  "mov.d "#f30", "#f10" \n\t" \
  "mov.d "#f24", "#f4" \n\t" \
  "mov.d "#f26", "#f6" \n\t" \
  "dmfc1 "#r2", "#f8" \n\t" \
  "dli "#r1", 0x0010001000100010 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f8" \n\t" \
  "paddh "#f2", "#f2", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f12" \n\t" \
  "paddh "#f30", "#f30", "#f14" \n\t" \
  "paddh "#f24", "#f24", "#f16" \n\t" \
  "paddh "#f26", "#f26", "#f18" \n\t" \
  "dli "#r1", 0x2 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psllh "#f28", "#f28", "#f8" \n\t" \
  "psllh "#f30", "#f30", "#f8" \n\t" \
  "psubh "#f28", "#f28", "#f24" \n\t" \
  "psubh "#f30", "#f30", "#f26" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "psllh "#f28", "#f28", "#f8" \n\t" \
  "psllh "#f30", "#f30", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "dli "#r1", 0x5 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "xor "#f28", "#f28", "#f28" \n\t" \
  "packushb "#f0", "#f0", "#f2" \n\t" \
  "gsswlc1 "#f0", 0x3("#r0") \n\t" \
  "gsswrc1 "#f0", 0x0("#r0") \n\t" \
  "dmtc1 "#r2", "#f8" \n\t"
1708
/*
 * Inline-asm fragment: 6-tap filter over six 16-bit input vectors, 8-byte store.
 *
 * Same arithmetic as FILTER_HV_W4; only the store differs. Inputs are six
 * register pairs (low/high halves): A = f0/f2, B = f4/f6, C = f8/f10,
 * D = f12/f14, E = f16/f18, F = f20/f22. Per 16-bit lane it computes
 *   (A + F - 5 * (B + E) + 20 * (C + D) + 16) >> 5
 * packs the lanes to bytes with unsigned saturation and stores all eight
 * bytes to address r0 (store-left / store-right pair).
 *
 * Register usage:
 *   f0/f2                : overwritten with the result
 *   f24/f26, f28/f30     : scratch; f28 is zero on exit
 *   f8                   : used for the constants 16, 2 and 5; its original
 *                          value is parked in GPR r2 and restored at the end
 *   r1                   : scratch GPR for loading constants
 */
#define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
  "paddh "#f0", "#f0", "#f20" \n\t" \
  "paddh "#f2", "#f2", "#f22" \n\t" \
  "mov.d "#f28", "#f8" \n\t" \
  "mov.d "#f30", "#f10" \n\t" \
  "mov.d "#f24", "#f4" \n\t" \
  "mov.d "#f26", "#f6" \n\t" \
  "dmfc1 "#r2", "#f8" \n\t" \
  "dli "#r1", 0x0010001000100010 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f8" \n\t" \
  "paddh "#f2", "#f2", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f12" \n\t" \
  "paddh "#f30", "#f30", "#f14" \n\t" \
  "paddh "#f24", "#f24", "#f16" \n\t" \
  "paddh "#f26", "#f26", "#f18" \n\t" \
  "dli "#r1", 0x2 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psllh "#f28", "#f28", "#f8" \n\t" \
  "psllh "#f30", "#f30", "#f8" \n\t" \
  "psubh "#f28", "#f28", "#f24" \n\t" \
  "psubh "#f30", "#f30", "#f26" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "psllh "#f28", "#f28", "#f8" \n\t" \
  "psllh "#f30", "#f30", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "dli "#r1", 0x5 \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "xor "#f28", "#f28", "#f28" \n\t" \
  "packushb "#f0", "#f0", "#f2" \n\t" \
  "gssdlc1 "#f0", 0x7("#r0") \n\t" \
  "gssdrc1 "#f0", 0x0("#r0") \n\t" \
  "dmtc1 "#r2", "#f8" \n\t"
1747
/*
 * Inline-asm fragment: 6-tap filter evaluated in reduced-precision steps,
 * result stored with an indexed 8-byte store at r0 + r1.
 *
 * Inputs are six register pairs (low/high halves): A = f0/f2, B = f4/f6,
 * C = f8/f10, D = f12/f14, E = f16/f18, F = f20/f22. With
 *   ae = A + F,  be = B + E,  cd = C + D
 * each 16-bit lane is computed as
 *   t   = (((ae - be) >> 2) + cd - be) >> 2
 *   out = (cd + t + round) >> 6
 * where "round" is the 64-bit pattern held in GPR r4 (supplied by the caller;
 * presumably 32 in every 16-bit lane -- TODO confirm at the call sites).
 * The lanes are then packed to bytes with unsigned saturation.
 *
 * Register usage:
 *   f0/f2, f24/f26, f28/f30 : clobbered (f28 holds the packed result)
 *   f8                      : used for shift counts and the rounding value;
 *                             its original value is parked in GPR r3 and
 *                             restored at the end
 *   r2                      : scratch GPR for loading constants
 */
#define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                         f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \
  "paddh "#f0", "#f0", "#f20" \n\t" \
  "paddh "#f2", "#f2", "#f22" \n\t" \
  "mov.d "#f24", "#f4" \n\t" \
  "mov.d "#f26", "#f6" \n\t" \
  "mov.d "#f28", "#f8" \n\t" \
  "mov.d "#f30", "#f10" \n\t" \
  "dli "#r2", 0x2 \n\t" \
  "paddh "#f24", "#f24", "#f16" \n\t" \
  "paddh "#f26", "#f26", "#f18" \n\t" \
  "dmfc1 "#r3", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f12" \n\t" \
  "paddh "#f30", "#f30", "#f14" \n\t" \
  "dmtc1 "#r2", "#f8" \n\t" \
  "psubh "#f0", "#f0", "#f24" \n\t" \
  "psubh "#f2", "#f2", "#f26" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "psubh "#f0", "#f0", "#f24" \n\t" \
  "psubh "#f2", "#f2", "#f26" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "dmtc1 "#r4", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f0" \n\t" \
  "paddh "#f30", "#f30", "#f2" \n\t" \
  "dli "#r2", 0x6 \n\t" \
  "paddh "#f28", "#f28", "#f8" \n\t" \
  "paddh "#f30", "#f30", "#f8" \n\t" \
  "dmtc1 "#r2", "#f8" \n\t" \
  "psrah "#f28", "#f28", "#f8" \n\t" \
  "psrah "#f30", "#f30", "#f8" \n\t" \
  "packushb "#f28", "#f28", "#f30" \n\t" \
  "gssdxc1 "#f28", 0x0("#r0", "#r1") \n\t" \
  "dmtc1 "#r3", "#f8" \n\t"
1785
/*
 * Inline-asm fragment: same arithmetic as FILTER_VER_ALIGN, but the 8-byte
 * result is written to address r0 with a store-left / store-right pair.
 *
 * Inputs are six register pairs (low/high halves): A = f0/f2, B = f4/f6,
 * C = f8/f10, D = f12/f14, E = f16/f18, F = f20/f22. With
 *   ae = A + F,  be = B + E,  cd = C + D
 * each 16-bit lane is computed as
 *   t   = (((ae - be) >> 2) + cd - be) >> 2
 *   out = (cd + t + round) >> 6
 * where "round" is the 64-bit pattern held in GPR r3 (supplied by the caller;
 * presumably 32 in every 16-bit lane -- TODO confirm at the call sites).
 * The lanes are then packed to bytes with unsigned saturation.
 *
 * Register usage:
 *   f0/f2, f24/f26, f28/f30 : clobbered (f28 holds the packed result)
 *   f8                      : used for shift counts and the rounding value;
 *                             its original value is parked in GPR r2 and
 *                             restored at the end
 *   r1                      : scratch GPR for loading constants
 */
#define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
                           f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \
  "paddh "#f0", "#f0", "#f20" \n\t" \
  "paddh "#f2", "#f2", "#f22" \n\t" \
  "mov.d "#f24", "#f4" \n\t" \
  "mov.d "#f26", "#f6" \n\t" \
  "mov.d "#f28", "#f8" \n\t" \
  "mov.d "#f30", "#f10" \n\t" \
  "dli "#r1", 0x2 \n\t" \
  "paddh "#f24", "#f24", "#f16" \n\t" \
  "paddh "#f26", "#f26", "#f18" \n\t" \
  "dmfc1 "#r2", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f12" \n\t" \
  "paddh "#f30", "#f30", "#f14" \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psubh "#f0", "#f0", "#f24" \n\t" \
  "psubh "#f2", "#f2", "#f26" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "paddh "#f0", "#f0", "#f28" \n\t" \
  "paddh "#f2", "#f2", "#f30" \n\t" \
  "psubh "#f0", "#f0", "#f24" \n\t" \
  "psubh "#f2", "#f2", "#f26" \n\t" \
  "psrah "#f0", "#f0", "#f8" \n\t" \
  "psrah "#f2", "#f2", "#f8" \n\t" \
  "dmtc1 "#r3", "#f8" \n\t" \
  "paddh "#f28", "#f28", "#f0" \n\t" \
  "paddh "#f30", "#f30", "#f2" \n\t" \
  "dli "#r1", 0x6 \n\t" \
  "paddh "#f28", "#f28", "#f8" \n\t" \
  "paddh "#f30", "#f30", "#f8" \n\t" \
  "dmtc1 "#r1", "#f8" \n\t" \
  "psrah "#f28", "#f28", "#f8" \n\t" \
  "psrah "#f30", "#f30", "#f8" \n\t" \
  "packushb "#f28", "#f28", "#f30" \n\t" \
  "gssdlc1 "#f28", 0x7("#r0") \n\t" \
  "gssdrc1 "#f28", 0x0("#r0") \n\t" \
  "dmtc1 "#r2", "#f8" \n\t"
1824
McHorVer20Width5_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1825 void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1826 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1827 BACKUP_REG;
1828 __asm__ volatile (
1829 ".set arch=loongson3a \n\t"
1830 "xor $f28, $f28, $f28 \n\t"
1831 PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
1832 "dli $8, 0x2 \n\t"
1833 "dli $10, 0x0010001000100010 \n\t"
1834 "dli $11, 0x5 \n\t"
1835 "1: \n\t"
1836 "xor $f28, $f28, $f28 \n\t"
1837 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
1838 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
1839 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
1840 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
1841 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
1842 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
1843 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
1844 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
1845 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
1846 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
1847 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
1848 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
1849 "punpckhbh $f2, $f0, $f28 \n\t"
1850 "punpckhbh $f6, $f4, $f28 \n\t"
1851 "punpckhbh $f10, $f8, $f28 \n\t"
1852 "punpckhbh $f14, $f12, $f28 \n\t"
1853 "punpckhbh $f18, $f16, $f28 \n\t"
1854 "punpckhbh $f22, $f20, $f28 \n\t"
1855 "punpcklbh $f0, $f0, $f28 \n\t"
1856 "punpcklbh $f4, $f4, $f28 \n\t"
1857 "punpcklbh $f8, $f8, $f28 \n\t"
1858 "punpcklbh $f12, $f12, $f28 \n\t"
1859 "punpcklbh $f16, $f16, $f28 \n\t"
1860 "punpcklbh $f20, $f20, $f28 \n\t"
1861
1862 "mov.d $f28, $f8 \n\t"
1863 "mov.d $f30, $f10 \n\t"
1864 "paddh $f28, $f28, $f12 \n\t"
1865 "paddh $f30, $f30, $f14 \n\t"
1866 "mov.d $f24, $f16 \n\t"
1867 "mov.d $f26, $f18 \n\t"
1868 "paddh $f24, $f24, $f20 \n\t"
1869 "paddh $f26, $f26, $f22 \n\t"
1870 "dmfc1 $9, $f12 \n\t"
1871 "dmtc1 $8, $f12 \n\t"
1872 "psllh $f24, $f24, $f12 \n\t"
1873 "psllh $f26, $f26, $f12 \n\t"
1874 "psubh $f24, $f24, $f28 \n\t"
1875 "psubh $f26, $f26, $f30 \n\t"
1876 "paddh $f0, $f0, $f4 \n\t"
1877 "paddh $f2, $f2, $f6 \n\t"
1878 "paddh $f0, $f0, $f24 \n\t"
1879 "paddh $f2, $f2, $f26 \n\t"
1880 "psllh $f24, $f24, $f12 \n\t"
1881 "psllh $f26, $f26, $f12 \n\t"
1882 "paddh $f0, $f0, $f24 \n\t"
1883 "paddh $f2, $f2, $f26 \n\t"
1884
1885 "dmtc1 $10, $f12 \n\t"
1886 "paddh $f0, $f0, $f12 \n\t"
1887 "paddh $f2, $f2, $f12 \n\t"
1888 "dmtc1 $11, $f12 \n\t"
1889 "psrah $f0, $f0, $f12 \n\t"
1890 "psrah $f2, $f2, $f12 \n\t"
1891 "packushb $f0, $f0, $f2 \n\t"
1892
1893 "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
1894 "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
1895
1896 "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
1897 "xor $f28, $f28, $f28 \n\t"
1898 "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
1899 "punpckhbh $f2, $f0, $f28 \n\t"
1900 "punpcklbh $f0, $f0, $f28 \n\t"
1901 "dmtc1 $9, $f12 \n\t"
1902 "dmtc1 $8, $f24 \n\t"
1903
1904 "paddh $f16, $f16, $f4 \n\t"
1905 "paddh $f18, $f18, $f6 \n\t"
1906 "paddh $f20, $f20, $f12 \n\t"
1907 "paddh $f22, $f22, $f14 \n\t"
1908 "psllh $f20, $f20, $f24 \n\t"
1909 "psllh $f22, $f22, $f24 \n\t"
1910 "psubh $f20, $f20, $f16 \n\t"
1911 "psubh $f22, $f22, $f18 \n\t"
1912 "paddh $f8, $f8, $f0 \n\t"
1913 "paddh $f10, $f10, $f2 \n\t"
1914 "paddh $f8, $f8, $f20 \n\t"
1915 "paddh $f10, $f10, $f22 \n\t"
1916 "psllh $f20, $f20, $f24 \n\t"
1917 "psllh $f22, $f22, $f24 \n\t"
1918 "paddh $f8, $f8, $f20 \n\t"
1919 "paddh $f10, $f10, $f22 \n\t"
1920
1921 "dmtc1 $10, $f24 \n\t"
1922 "paddh $f8, $f8, $f24 \n\t"
1923 "paddh $f10, $f10, $f24 \n\t"
1924 "dmtc1 $11, $f24 \n\t"
1925 "psrah $f8, $f8, $f24 \n\t"
1926 "psrah $f10, $f10, $f24 \n\t"
1927 "packushb $f8, $f8, $f10 \n\t"
1928 "gsswlc1 $f8, 0x4(%[pDst]) \n\t"
1929 "gsswrc1 $f8, 0x1(%[pDst]) \n\t"
1930
1931 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
1932 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
1933 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
1934 "bnez %[iHeight], 1b \n\t"
1935 : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
1936 [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
1937 : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
1938 : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
1939 "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
1940 "$f28", "$f30"
1941 );
1942 RECOVER_REG;
1943 }
1944
McHorVer20Width9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)1945 void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
1946 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
1947 BACKUP_REG;
1948 __asm__ volatile (
1949 ".set arch=loongson3a \n\t"
1950 PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
1951 "xor $f28, $f28, $f28 \n\t"
1952 "dli $8, 0x2 \n\t"
1953 "dli $9, 0x9 \n\t"
1954 "dli $10, 0x0010001000100010 \n\t"
1955 "dli $11, 0x5 \n\t"
1956 "bne %[iWidth], $9, 2f \n\t"
1957 "1: \n\t"
1958 "xor $f28, $f28, $f28 \n\t"
1959 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
1960 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
1961 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
1962 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
1963 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
1964 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
1965 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
1966 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
1967 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
1968 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
1969 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
1970 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
1971 "punpckhbh $f2, $f0, $f28 \n\t"
1972 "punpckhbh $f6, $f4, $f28 \n\t"
1973 "punpckhbh $f10, $f8, $f28 \n\t"
1974 "punpckhbh $f14, $f12, $f28 \n\t"
1975 "punpckhbh $f18, $f16, $f28 \n\t"
1976 "punpckhbh $f22, $f20, $f28 \n\t"
1977 "punpcklbh $f0, $f0, $f28 \n\t"
1978 "punpcklbh $f4, $f4, $f28 \n\t"
1979 "punpcklbh $f8, $f8, $f28 \n\t"
1980 "punpcklbh $f12, $f12, $f28 \n\t"
1981 "punpcklbh $f16, $f16, $f28 \n\t"
1982 "punpcklbh $f20, $f20, $f28 \n\t"
1983
1984 "mov.d $f28, $f8 \n\t"
1985 "mov.d $f30, $f10 \n\t"
1986 "paddh $f28, $f28, $f12 \n\t"
1987 "paddh $f30, $f30, $f14 \n\t"
1988 "mov.d $f24, $f16 \n\t"
1989 "mov.d $f26, $f18 \n\t"
1990 "paddh $f24, $f24, $f20 \n\t"
1991 "paddh $f26, $f26, $f22 \n\t"
1992 "dmfc1 $9, $f12 \n\t"
1993 "dmtc1 $8, $f12 \n\t"
1994 "psllh $f24, $f24, $f12 \n\t"
1995 "psllh $f26, $f26, $f12 \n\t"
1996 "psubh $f24, $f24, $f28 \n\t"
1997 "psubh $f26, $f26, $f30 \n\t"
1998 "paddh $f0, $f0, $f4 \n\t"
1999 "paddh $f2, $f2, $f6 \n\t"
2000 "paddh $f0, $f0, $f24 \n\t"
2001 "paddh $f2, $f2, $f26 \n\t"
2002 "psllh $f24, $f24, $f12 \n\t"
2003 "psllh $f26, $f26, $f12 \n\t"
2004 "paddh $f0, $f0, $f24 \n\t"
2005 "paddh $f2, $f2, $f26 \n\t"
2006
2007 "dmtc1 $10, $f12 \n\t"
2008 "paddh $f0, $f0, $f12 \n\t"
2009 "paddh $f2, $f2, $f12 \n\t"
2010 "dmtc1 $11, $f12 \n\t"
2011 "psrah $f0, $f0, $f12 \n\t"
2012 "psrah $f2, $f2, $f12 \n\t"
2013 "packushb $f0, $f0, $f2 \n\t"
2014
2015 "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
2016 "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
2017
2018 "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
2019 "xor $f28, $f28, $f28 \n\t"
2020 "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
2021 "punpckhbh $f2, $f0, $f28 \n\t"
2022 "punpcklbh $f0, $f0, $f28 \n\t"
2023 "dmtc1 $9, $f12 \n\t"
2024 "dmtc1 $8, $f24 \n\t"
2025
2026 "paddh $f16, $f16, $f4 \n\t"
2027 "paddh $f18, $f18, $f6 \n\t"
2028 "paddh $f20, $f20, $f12 \n\t"
2029 "paddh $f22, $f22, $f14 \n\t"
2030 "psllh $f20, $f20, $f24 \n\t"
2031 "psllh $f22, $f22, $f24 \n\t"
2032 "psubh $f20, $f20, $f16 \n\t"
2033 "psubh $f22, $f22, $f18 \n\t"
2034 "paddh $f8, $f8, $f0 \n\t"
2035 "paddh $f10, $f10, $f2 \n\t"
2036 "paddh $f8, $f8, $f20 \n\t"
2037 "paddh $f10, $f10, $f22 \n\t"
2038 "psllh $f20, $f20, $f24 \n\t"
2039 "psllh $f22, $f22, $f24 \n\t"
2040 "paddh $f8, $f8, $f20 \n\t"
2041 "paddh $f10, $f10, $f22 \n\t"
2042
2043 "dmtc1 $10, $f24 \n\t"
2044 "paddh $f8, $f8, $f24 \n\t"
2045 "paddh $f10, $f10, $f24 \n\t"
2046 "dmtc1 $11, $f24 \n\t"
2047 "psrah $f8, $f8, $f24 \n\t"
2048 "psrah $f10, $f10, $f24 \n\t"
2049 "packushb $f8, $f8, $f10 \n\t"
2050 "gssdlc1 $f8, 0x8(%[pDst]) \n\t"
2051 "gssdrc1 $f8, 0x1(%[pDst]) \n\t"
2052
2053 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2054 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2055 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2056 "bnez %[iHeight], 1b \n\t"
2057 "j 3f \n\t"
2058
2059 "2: \n\t"
2060 "xor $f28, $f28, $f28 \n\t"
2061 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
2062 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
2063 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
2064 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
2065 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
2066 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
2067 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
2068 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
2069 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
2070 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
2071 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
2072 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
2073 "punpckhbh $f2, $f0, $f28 \n\t"
2074 "punpckhbh $f6, $f4, $f28 \n\t"
2075 "punpckhbh $f10, $f8, $f28 \n\t"
2076 "punpckhbh $f14, $f12, $f28 \n\t"
2077 "punpckhbh $f18, $f16, $f28 \n\t"
2078 "punpckhbh $f22, $f20, $f28 \n\t"
2079 "punpcklbh $f0, $f0, $f28 \n\t"
2080 "punpcklbh $f4, $f4, $f28 \n\t"
2081 "punpcklbh $f8, $f8, $f28 \n\t"
2082 "punpcklbh $f12, $f12, $f28 \n\t"
2083 "punpcklbh $f16, $f16, $f28 \n\t"
2084 "punpcklbh $f20, $f20, $f28 \n\t"
2085
2086 "dmtc1 $8, $f30 \n\t"
2087 "paddh $f8, $f8, $f12 \n\t"
2088 "paddh $f10, $f10, $f14 \n\t"
2089 "paddh $f16, $f16, $f20 \n\t"
2090 "paddh $f18, $f18, $f22 \n\t"
2091 "psllh $f16, $f16, $f30 \n\t"
2092 "psllh $f18, $f18, $f30 \n\t"
2093 "psubh $f16, $f16, $f8 \n\t"
2094 "psubh $f18, $f18, $f10 \n\t"
2095 "paddh $f0, $f0, $f4 \n\t"
2096 "paddh $f2, $f2, $f6 \n\t"
2097 "paddh $f0, $f0, $f16 \n\t"
2098 "paddh $f2, $f2, $f18 \n\t"
2099 "psllh $f16, $f16, $f30 \n\t"
2100 "psllh $f18, $f18, $f30 \n\t"
2101 "paddh $f0, $f0, $f16 \n\t"
2102 "paddh $f2, $f2, $f18 \n\t"
2103
2104 "dmtc1 $10, $f30 \n\t"
2105 "paddh $f0, $f0, $f30 \n\t"
2106 "paddh $f2, $f2, $f30 \n\t"
2107 "dmtc1 $11, $f30 \n\t"
2108 "psrah $f0, $f0, $f30 \n\t"
2109 "psrah $f2, $f2, $f30 \n\t"
2110 "packushb $f0, $f0, $f2 \n\t"
2111 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
2112 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
2113
2114 "gsldlc1 $f0, 15(%[pSrc]) \n\t"
2115 "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
2116 "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
2117 "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
2118 "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
2119 "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
2120 "gsldrc1 $f0, 8(%[pSrc]) \n\t"
2121 "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
2122 "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
2123 "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
2124 "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
2125 "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
2126 "punpckhbh $f2, $f0, $f28 \n\t"
2127 "punpckhbh $f6, $f4, $f28 \n\t"
2128 "punpckhbh $f10, $f8, $f28 \n\t"
2129 "punpckhbh $f14, $f12, $f28 \n\t"
2130 "punpckhbh $f18, $f16, $f28 \n\t"
2131 "punpckhbh $f22, $f20, $f28 \n\t"
2132 "punpcklbh $f0, $f0, $f28 \n\t"
2133 "punpcklbh $f4, $f4, $f28 \n\t"
2134 "punpcklbh $f8, $f8, $f28 \n\t"
2135 "punpcklbh $f12, $f12, $f28 \n\t"
2136 "punpcklbh $f16, $f16, $f28 \n\t"
2137 "punpcklbh $f20, $f20, $f28 \n\t"
2138
2139 "mov.d $f28, $f8 \n\t"
2140 "mov.d $f30, $f10 \n\t"
2141 "paddh $f28, $f28, $f12 \n\t"
2142 "paddh $f30, $f30, $f14 \n\t"
2143 "mov.d $f24, $f16 \n\t"
2144 "mov.d $f26, $f18 \n\t"
2145 "paddh $f24, $f24, $f20 \n\t"
2146 "paddh $f26, $f26, $f22 \n\t"
2147 "dmfc1 $9, $f12 \n\t"
2148 "dmtc1 $8, $f12 \n\t"
2149 "psllh $f24, $f24, $f12 \n\t"
2150 "psllh $f26, $f26, $f12 \n\t"
2151 "psubh $f24, $f24, $f28 \n\t"
2152 "psubh $f26, $f26, $f30 \n\t"
2153 "paddh $f0, $f0, $f4 \n\t"
2154 "paddh $f2, $f2, $f6 \n\t"
2155 "paddh $f0, $f0, $f24 \n\t"
2156 "paddh $f2, $f2, $f26 \n\t"
2157 "psllh $f24, $f24, $f12 \n\t"
2158 "psllh $f26, $f26, $f12 \n\t"
2159 "paddh $f0, $f0, $f24 \n\t"
2160 "paddh $f2, $f2, $f26 \n\t"
2161
2162 "dmtc1 $10, $f30 \n\t"
2163 "paddh $f0, $f0, $f30 \n\t"
2164 "paddh $f2, $f2, $f30 \n\t"
2165 "dmtc1 $11, $f30 \n\t"
2166 "psrah $f0, $f0, $f30 \n\t"
2167 "psrah $f2, $f2, $f30 \n\t"
2168 "packushb $f0, $f0, $f2 \n\t"
2169 "gsswlc1 $f0, 0xb(%[pDst]) \n\t"
2170 "gsswrc1 $f0, 0x8(%[pDst]) \n\t"
2171
2172 "dmtc1 $9, $f12 \n\t"
2173 "xor $f28, $f28, $f28 \n\t"
2174 "dli $9, 0x20 \n\t"
2175 "gsldlc1 $f0, 0x15(%[pSrc]) \n\t"
2176 "dmtc1 $9, $f30 \n\t"
2177 "gsldrc1 $f0, 0xE(%[pSrc]) \n\t"
2178 "punpckhbh $f2, $f0, $f28 \n\t"
2179 "punpcklbh $f0, $f0, $f28 \n\t"
2180 "dmtc1 $8, $f24 \n\t"
2181
2182 "paddh $f16, $f16, $f4 \n\t"
2183 "paddh $f18, $f18, $f6 \n\t"
2184 "paddh $f20, $f20, $f12 \n\t"
2185 "paddh $f22, $f22, $f14 \n\t"
2186 "psllh $f20, $f20, $f24 \n\t"
2187 "psllh $f22, $f22, $f24 \n\t"
2188 "psubh $f20, $f20, $f16 \n\t"
2189 "psubh $f22, $f22, $f18 \n\t"
2190 "paddh $f8, $f8, $f0 \n\t"
2191 "paddh $f10, $f10, $f2 \n\t"
2192 "paddh $f8, $f8, $f20 \n\t"
2193 "paddh $f10, $f10, $f22 \n\t"
2194 "psllh $f20, $f20, $f24 \n\t"
2195 "psllh $f22, $f22, $f24 \n\t"
2196 "paddh $f8, $f8, $f20 \n\t"
2197 "paddh $f10, $f10, $f22 \n\t"
2198
2199 "dmtc1 $10, $f24 \n\t"
2200 "paddh $f8, $f8, $f24 \n\t"
2201 "paddh $f10, $f10, $f24 \n\t"
2202 "dmtc1 $11, $f24 \n\t"
2203 "psrah $f8, $f8, $f24 \n\t"
2204 "psrah $f10, $f10, $f24 \n\t"
2205 "packushb $f8, $f8, $f10 \n\t"
2206 "gssdlc1 $f8, 0x10(%[pDst]) \n\t"
2207 "gssdrc1 $f8, 0x9(%[pDst]) \n\t"
2208
2209 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2210 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2211 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2212 "bnez %[iHeight], 2b \n\t"
2213 "3: \n\t"
2214 : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2215 [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2216 : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
2217 : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
2218 "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
2219 "$f28", "$f30"
2220 );
2221 RECOVER_REG;
2222 }
2223
2224 //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
McHorVer20Width5Or9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2225 static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
2226 uint8_t* pDst, int32_t iDstStride,
2227 int32_t iWidth, int32_t iHeight) {
2228 if (iWidth == 17 || iWidth == 9)
2229 McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2230 else //if (iWidth == 5)
2231 McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2232 }
2233
McHorVer02Height5_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2234 void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2235 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2236 BACKUP_REG;
2237 __asm__ volatile (
2238 ".set arch=loongson3a \n\t"
2239 "move $12, %[pSrc] \n\t"
2240 "move $13, %[pDst] \n\t"
2241 "move $14, %[iHeight] \n\t"
2242
2243 "dsrl %[iWidth], %[iWidth], 0x2 \n\t"
2244 PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t"
2245 PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
2246
2247 "1: \n\t"
2248 "xor $f28, $f28, $f28 \n\t"
2249 MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2250 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2251 MMI_LOAD_8P($f4, $f6, $f28, $8)
2252
2253 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2254 MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2255 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2256 MMI_LOAD_8P($f12, $f14, $f28, $8)
2257 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2258 MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2259 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2260 MMI_LOAD_8P($f20, $f22, $f28, $8)
2261 FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2262 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2263 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2264 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2265 MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2266 "mov.d $f0, $f4 \n\t"
2267 "mov.d $f2, $f6 \n\t"
2268 "mov.d $f4, $f8 \n\t"
2269 "mov.d $f6, $f10 \n\t"
2270 "mov.d $f8, $f12 \n\t"
2271 "mov.d $f10, $f14 \n\t"
2272 "mov.d $f12, $f16 \n\t"
2273 "mov.d $f14, $f18 \n\t"
2274 "mov.d $f16, $f20 \n\t"
2275 "mov.d $f18, $f22 \n\t"
2276 "mov.d $f20, $f24 \n\t"
2277 "mov.d $f22, $f26 \n\t"
2278
2279 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2280 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2281
2282 "2: \n\t"
2283 FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2284 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2285 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2286 "beqz %[iHeight], 3f \n\t"
2287
2288 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2289 MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2290 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2291 FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2292 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2293 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2294 "beqz %[iHeight], 3f \n\t"
2295
2296 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2297 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2298 MMI_LOAD_8P($f28, $f30, $f0, $8)
2299 FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2300 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2301 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2302 "beqz %[iHeight], 3f \n\t"
2303
2304 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2305 MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2306 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2307 FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2308 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2309 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2310 "beqz %[iHeight], 3f \n\t"
2311
2312 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2313 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2314 MMI_LOAD_8P($f4, $f6, $f8, $8)
2315 FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
2316 $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2317 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2318 "beqz %[iHeight], 3f \n\t"
2319
2320 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2321 MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2322 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2323 FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
2324 $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2325 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2326 "beqz %[iHeight], 3f \n\t"
2327
2328 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2329 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2330 MMI_LOAD_8P($f12, $f14, $f16, $8)
2331 FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
2332 $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2333 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2334 "beqz %[iHeight], 3f \n\t"
2335
2336 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2337 MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2338 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2339 FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2340 $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2341 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2342 "beqz %[iHeight], 3f \n\t"
2343
2344 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2345 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2346 MMI_LOAD_8P($f20, $f22, $f24, $8)
2347 "j 2b \n\t"
2348
2349 "3: \n\t"
2350 PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
2351 "beqz %[iWidth], 4f \n\t"
2352 "move %[pSrc], $12 \n\t"
2353 "move %[pDst], $13 \n\t"
2354 "move %[iHeight], $14 \n\t"
2355 PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
2356 PTR_ADDIU "%[pSrc], %[pSrc], 0x4 \n\t"
2357 PTR_ADDIU "%[pDst], %[pDst], 0x4 \n\t"
2358 "j 1b \n\t"
2359 "4: \n\t"
2360 : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2361 [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2362 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
2363 : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2364 "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2365 "$f24", "$f26", "$f28", "$f30"
2366 );
2367 RECOVER_REG;
2368 }
2369
// Vertical half-sample filter, Loongson MMI inline assembly; in this file it is
// reached from McHorVer02Height5Or9Or17_mmi when iWidth is 8 or 16.
//
// The block is processed in column strips of 8 bytes (iWidth >> 3 strips). For
// each strip the code starts two rows above pSrc, keeps a sliding window of
// source rows in $f0..$f30 and writes one destination row per FILTER_HV_W8
// expansion until iHeight rows are done.
//
//   pSrc, iSrcStride  first source row of the block and its row pitch in bytes
//   pDst, iDstStride  first destination row and its row pitch in bytes
//   iWidth            block width; shifted right by 3 to get the strip count
//   iHeight           number of rows to write per strip
//
// Preconditions visible in the code:
//   - the first row is written without a zero test and the counter is only
//     checked after the second one, so iHeight is expected to be >= 2;
//   - the strip counter is decremented before it is tested, so iWidth >> 3 is
//     expected to be >= 1;
//   - the base pointers saved in $12/$13 are never advanced, so only two
//     distinct strips (byte offsets 0 and 8) can be addressed.
//
// NOTE(review): MMI_LOAD_8P, FILTER_HV_W8, BACKUP_REG and RECOVER_REG are
// defined outside this chunk; presumably MMI_LOAD_8P widens 8 source bytes
// into two registers and FILTER_HV_W8 filters six rows and stores 8 bytes at
// pDst -- confirm against their definitions.
McHorVer02Height9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2370 void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
2371 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
2372 BACKUP_REG;
2373 __asm__ volatile (
2374 ".set arch=loongson3a \n\t"
// Keep the entry pointers and the height so every strip can restart from them.
2375 "move $12, %[pSrc] \n\t"
2376 "move $13, %[pDst] \n\t"
2377 "move $14, %[iHeight] \n\t"
2378
// iWidth becomes the strip count, $10 = 2 * iSrcStride, pSrc moves two rows up.
2379 "dsrl %[iWidth], %[iWidth], 0x3 \n\t"
2380 PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t"
2381 PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
2382
// ---- label 1: start of one 8-byte strip ----
2383 "1: \n\t"
// $f28 = 0 and $f30 = 0x20 are set up before the macros below use them.
2384 "dli $8, 0x20 \n\t"
2385 "xor $f28, $f28, $f28 \n\t"
2386 "dmtc1 $8, $f30 \n\t"
2387
// Load six consecutive source rows: pSrc advances by $10 (two rows) and $8
// addresses the row in between.
2388 MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
2389 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2390 MMI_LOAD_8P($f4, $f6, $f28, $8)
2391 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2392 MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
2393 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2394 MMI_LOAD_8P($f12, $f14, $f28, $8)
2395 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2396 MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
2397 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2398 MMI_LOAD_8P($f20, $f22, $f28, $8)
// First output row of the strip; the counter is decremented but not tested here.
2399 FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2400 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
2401 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
// Fetch the seventh row, then slide the window down by one row
// ($f4.. -> $f0.., newest row ends up in $f20/$f22).
2402 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2403 MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2404 "mov.d $f0, $f4 \n\t"
2405 "mov.d $f2, $f6 \n\t"
2406 "mov.d $f4, $f8 \n\t"
2407 "mov.d $f6, $f10 \n\t"
2408 "mov.d $f8, $f12 \n\t"
2409 "mov.d $f10, $f14 \n\t"
2410 "mov.d $f12, $f16 \n\t"
2411 "mov.d $f14, $f18 \n\t"
2412 "mov.d $f16, $f20 \n\t"
2413 "mov.d $f18, $f22 \n\t"
2414 "mov.d $f20, $f24 \n\t"
2415 "mov.d $f22, $f26 \n\t"
2416 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2417 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2418
// ---- label 2: row loop, unrolled eight times ----
// The register list handed to FILTER_HV_W8 is rotated by one register pair per
// step, so the window moves without register-to-register copies. Each step
// writes one row, leaves through label 3 when the counter reaches zero, and
// otherwise loads exactly one new source row for the next step.
2419 "2: \n\t"
2420 FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2421 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
// NOTE(review): this is the only one of the eight steps that moves $9 back
// into $f8 after the macro. Whether it is needed depends on how FILTER_HV_W8
// uses its $9 scratch argument (definition not visible here) -- confirm.
2422 "dmtc1 $9, $f8 \n\t"
2423 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2424 "beqz %[iHeight], 3f \n\t"
2425
2426 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2427 MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
2428 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2429 FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2430 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
2431 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2432 "beqz %[iHeight], 3f \n\t"
2433
2434 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2435 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2436 MMI_LOAD_8P($f28, $f30, $f0, $8)
2437 FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2438 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
2439 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2440 "beqz %[iHeight], 3f \n\t"
2441
2442 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2443 MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
2444 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2445 FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2446 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
2447 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2448 "beqz %[iHeight], 3f \n\t"
2449
2450 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2451 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2452 MMI_LOAD_8P($f4, $f6, $f8, $8)
2453 FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2454 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
2455 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2456 "beqz %[iHeight], 3f \n\t"
2457
2458 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2459 MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
2460 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2461 FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2462 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
2463 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2464 "beqz %[iHeight], 3f \n\t"
2465
2466 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2467 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2468 MMI_LOAD_8P($f12, $f14, $f16, $8)
2469 FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2470 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
2471 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2472 "beqz %[iHeight], 3f \n\t"
2473
2474 PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
2475 MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
2476 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2477 FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2478 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
2479 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2480 "beqz %[iHeight], 3f \n\t"
2481
// After the eighth step the window is back in its initial register order.
2482 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2483 PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
2484 MMI_LOAD_8P($f20, $f22, $f24, $8)
2485 "j 2b \n\t"
2486
// ---- label 3: strip finished ----
2487 "3: \n\t"
2488 PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
2489 "beqz %[iWidth], 4f \n\t"
2490
// Restore the entry state, go two rows up again and move 8 bytes to the right.
2491 "move %[pSrc], $12 \n\t"
2492 "move %[pDst], $13 \n\t"
2493 "move %[iHeight], $14 \n\t"
2494 PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
2495 PTR_ADDIU "%[pSrc], %[pSrc], 0x8 \n\t"
2496 PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
2497 "j 1b \n\t"
2498 "4: \n\t"
// pSrc, pDst, iWidth and iHeight are all rewritten by the assembly, hence "+&r".
2499 : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
2500 [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
2501 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
2502 : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
2503 "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
2504 "$f24", "$f26", "$f28", "$f30"
2505 );
2506 RECOVER_REG;
2507 }
2508
2509 //vertical filter to gain half sample, that is (0, 2) location in quarter sample
McHorVer02Height5Or9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2510 static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
2511 uint8_t* pDst, int32_t iDstStride,
2512 int32_t iWidth, int32_t iHeight) {
2513 if (iWidth == 16 || iWidth == 8)
2514 McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight );
2515 else
2516 McHorVer02Height5_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
2517 }
2518
// Horizontal first pass of the (2, 2) interpolation, Loongson MMI inline
// assembly. Starting two rows above pSrc, each of iHeight rows is run through
// the six-tap sum
//     s[0] - 5*s[1] + 20*s[2] + 20*s[3] - 5*s[4] + s[5]
// built as a + t + 4*t with a = s[0] + s[5] and t = 4*(s[2] + s[3]) - (s[1] + s[4]).
// The halfword results are stored to pTap without rounding or clipping.
//
//   pSrc, iSrcStride  source row and pitch in bytes; the caller in this file
//                     passes the block start minus two columns
//   pTap, iTapStride  output rows of halfword sums and their pitch in bytes
//   iWidth            9 runs the loop at label 1 (9 results per row, tap bytes
//                     0x0..0x11, source bytes 0x0..0xd); any other value runs
//                     the loop at label 2 (17 results per row, tap bytes
//                     0x0..0x21, source bytes 0x0..0x15)
//   iHeight           rows to process; both loops test the counter after the
//                     body, so it is expected to be >= 1
//
// NOTE(review): the label 2 loop stores with gssqc1 (quad word) at offset 0 of
// every tap row; presumably that needs pTap and iTapStride to be multiples of
// 16 -- confirm against the Loongson instruction reference.
McHorVer22HorFirst_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pTap,int32_t iTapStride,int32_t iWidth,int32_t iHeight)2519 static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
2520 uint8_t * pTap, int32_t iTapStride,
2521 int32_t iWidth, int32_t iHeight) {
2522 BACKUP_REG;
2523 __asm__ volatile (
2524 ".set arch=loongson3a \n\t"
// $8 = 9 for the width test; pSrc is moved two rows up before either loop starts.
2525 "dli $8, 0x9 \n\t"
2526 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2527 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2528 "bne %[iWidth], $8, 2f \n\t"
2529
// ---- label 1: iWidth == 9 ----
// Unaligned 8-byte loads at source offsets 0, 5, 1, 4, 2 and 3. Each load is
// widened to halfwords against the zero in $f28: the low four bytes stay in
// the loaded register, the high four go to the next even register.
2530 "1: \n\t"
2531 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
2532 "xor $f28, $f28, $f28 \n\t"
2533 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
2534 "punpckhbh $f2, $f0, $f28 \n\t"
2535 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
2536 "punpcklbh $f0, $f0, $f28 \n\t"
2537 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
2538 "punpckhbh $f6, $f4, $f28 \n\t"
2539 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
2540 "punpcklbh $f4, $f4, $f28 \n\t"
2541 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
2542 "punpckhbh $f10, $f8, $f28 \n\t"
2543 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
2544 "punpcklbh $f8, $f8, $f28 \n\t"
2545 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
2546 "punpckhbh $f14, $f12, $f28 \n\t"
2547 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
2548 "punpcklbh $f12, $f12, $f28 \n\t"
2549 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
2550 "punpckhbh $f18, $f16, $f28 \n\t"
2551 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
2552 "punpcklbh $f16, $f16, $f28 \n\t"
2553 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
2554 "punpckhbh $f22, $f20, $f28 \n\t"
2555 "punpcklbh $f20, $f20, $f28 \n\t"
2556
// Sums for columns 0.. are built in $f0/$f2 from copies, so the widened inputs
// stay intact for the second half. $f12 is parked in $9 while it holds the
// shift count 2.
2557 "mov.d $f28, $f8 \n\t"
2558 "mov.d $f30, $f10 \n\t"
2559 "paddh $f28, $f28, $f12 \n\t"
2560 "paddh $f30, $f30, $f14 \n\t"
2561 "mov.d $f24, $f16 \n\t"
2562 "mov.d $f26, $f18 \n\t"
2563 "paddh $f24, $f24, $f20 \n\t"
2564 "paddh $f26, $f26, $f22 \n\t"
2565 "dli $8, 0x2 \n\t"
2566 "dmfc1 $9, $f12 \n\t"
2567 "dmtc1 $8, $f12 \n\t"
2568 "psllh $f24, $f24, $f12 \n\t"
2569 "psllh $f26, $f26, $f12 \n\t"
2570 "psubh $f24, $f24, $f28 \n\t"
2571 "psubh $f26, $f26, $f30 \n\t"
2572 "paddh $f0, $f0, $f4 \n\t"
2573 "paddh $f2, $f2, $f6 \n\t"
2574 "paddh $f0, $f0, $f24 \n\t"
2575 "paddh $f2, $f2, $f26 \n\t"
2576 "psllh $f24, $f24, $f12 \n\t"
2577 "psllh $f26, $f26, $f12 \n\t"
2578 "paddh $f0, $f0, $f24 \n\t"
2579 "paddh $f2, $f2, $f26 \n\t"
// Only the low word of $f0 (results 0 and 1) is written: tap bytes 0x0..0x3.
2580 "gsswlc1 $f0, 0x3(%[pTap]) \n\t"
2581 "gsswrc1 $f0, 0x0(%[pTap]) \n\t"
2582
// Second half: load source offset 6 and reuse the rows above shifted by one
// column, which yields results 1..8 in $f8/$f10. $f12 is restored from $9 and
// $f24 takes over as shift count.
2583 "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
2584 "xor $f28, $f28, $f28 \n\t"
2585 "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
2586 "punpckhbh $f2, $f0, $f28 \n\t"
2587 "punpcklbh $f0, $f0, $f28 \n\t"
2588 "dli $8, 0x2 \n\t"
2589 "dmtc1 $9, $f12 \n\t"
2590 "dmtc1 $8, $f24 \n\t"
2591
2592 "paddh $f16, $f16, $f4 \n\t"
2593 "paddh $f18, $f18, $f6 \n\t"
2594 "paddh $f20, $f20, $f12 \n\t"
2595 "paddh $f22, $f22, $f14 \n\t"
2596 "psllh $f20, $f20, $f24 \n\t"
2597 "psllh $f22, $f22, $f24 \n\t"
2598 "psubh $f20, $f20, $f16 \n\t"
2599 "psubh $f22, $f22, $f18 \n\t"
2600 "paddh $f8, $f8, $f0 \n\t"
2601 "paddh $f10, $f10, $f2 \n\t"
2602 "paddh $f8, $f8, $f20 \n\t"
2603 "paddh $f10, $f10, $f22 \n\t"
2604 "psllh $f20, $f20, $f24 \n\t"
2605 "psllh $f22, $f22, $f24 \n\t"
2606 "paddh $f8, $f8, $f20 \n\t"
2607 "paddh $f10, $f10, $f22 \n\t"
// Results 1..8 go to tap bytes 0x2..0x11 (result 1 is written a second time).
2608 "gssdlc1 $f8, 0x9(%[pTap]) \n\t"
2609 "gssdlc1 $f10, 0x11(%[pTap]) \n\t"
2610 "gssdrc1 $f8, 0x2(%[pTap]) \n\t"
2611 "gssdrc1 $f10, 0xa(%[pTap]) \n\t"
2612
// Next source / tap row.
2613 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2614 PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t"
2615 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2616 "bnez %[iHeight], 1b \n\t"
2617 "j 3f \n\t"
2618
// ---- label 2: every other iWidth (the caller in this file passes 17) ----
// Results 0..7: same loads as above; here the inputs are consumed in place and
// $f30 holds the shift count.
2619 "2: \n\t"
2620 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
2621 "xor $f28, $f28, $f28 \n\t"
2622 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
2623 "punpckhbh $f2, $f0, $f28 \n\t"
2624 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
2625 "punpcklbh $f0, $f0, $f28 \n\t"
2626 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
2627 "punpckhbh $f6, $f4, $f28 \n\t"
2628 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
2629 "punpcklbh $f4, $f4, $f28 \n\t"
2630 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
2631 "punpckhbh $f10, $f8, $f28 \n\t"
2632 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
2633 "punpcklbh $f8, $f8, $f28 \n\t"
2634 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
2635 "punpckhbh $f14, $f12, $f28 \n\t"
2636 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
2637 "punpcklbh $f12, $f12, $f28 \n\t"
2638 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
2639 "punpckhbh $f18, $f16, $f28 \n\t"
2640 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
2641 "punpcklbh $f16, $f16, $f28 \n\t"
2642 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
2643 "punpckhbh $f22, $f20, $f28 \n\t"
2644 "dli $8, 0x2 \n\t"
2645 "punpcklbh $f20, $f20, $f28 \n\t"
2646
2647 "dmtc1 $8, $f30 \n\t"
2648 "paddh $f8, $f8, $f12 \n\t"
2649 "paddh $f10, $f10, $f14 \n\t"
2650 "paddh $f16, $f16, $f20 \n\t"
2651 "paddh $f18, $f18, $f22 \n\t"
2652 "psllh $f16, $f16, $f30 \n\t"
2653 "psllh $f18, $f18, $f30 \n\t"
2654 "psubh $f16, $f16, $f8 \n\t"
2655 "psubh $f18, $f18, $f10 \n\t"
2656 "paddh $f0, $f0, $f4 \n\t"
2657 "paddh $f2, $f2, $f6 \n\t"
2658 "paddh $f0, $f0, $f16 \n\t"
2659 "paddh $f2, $f2, $f18 \n\t"
2660 "psllh $f16, $f16, $f30 \n\t"
2661 "psllh $f18, $f18, $f30 \n\t"
2662 "paddh $f0, $f0, $f16 \n\t"
2663 "paddh $f2, $f2, $f18 \n\t"
// Results 0..7 are written with one quad-word store: tap bytes 0x0..0xf.
2664 "gssqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
2665
// Same scheme eight columns further right (source offsets 8, 0xd, 9, 0xc, 0xa,
// 0xb). $f28 still holds zero from the xor at the top of this loop.
2666 "gsldlc1 $f0, 15(%[pSrc]) \n\t"
2667 "gsldrc1 $f0, 8(%[pSrc]) \n\t"
2668 "punpckhbh $f2, $f0, $f28 \n\t"
2669 "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
2670 "punpcklbh $f0, $f0, $f28 \n\t"
2671 "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
2672 "punpckhbh $f6, $f4, $f28 \n\t"
2673 "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
2674 "punpcklbh $f4, $f4, $f28 \n\t"
2675 "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
2676 "punpckhbh $f10, $f8, $f28 \n\t"
2677 "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
2678 "punpcklbh $f8, $f8, $f28 \n\t"
2679 "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
2680 "punpckhbh $f14, $f12, $f28 \n\t"
2681 "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
2682 "punpcklbh $f12, $f12, $f28 \n\t"
2683 "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
2684 "punpckhbh $f18, $f16, $f28 \n\t"
2685 "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
2686 "punpcklbh $f16, $f16, $f28 \n\t"
2687 "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
2688 "punpckhbh $f22, $f20, $f28 \n\t"
2689 "punpcklbh $f20, $f20, $f28 \n\t"
2690
2691 "mov.d $f28, $f8 \n\t"
2692 "mov.d $f30, $f10 \n\t"
2693 "paddh $f28, $f28, $f12 \n\t"
2694 "paddh $f30, $f30, $f14 \n\t"
2695 "mov.d $f24, $f16 \n\t"
2696 "mov.d $f26, $f18 \n\t"
2697 "dli $8, 0x2 \n\t"
2698 "paddh $f24, $f24, $f20 \n\t"
2699 "paddh $f26, $f26, $f22 \n\t"
2700 "dmfc1 $9, $f12 \n\t"
2701 "dmtc1 $8, $f12 \n\t"
2702 "psllh $f24, $f24, $f12 \n\t"
2703 "psllh $f26, $f26, $f12 \n\t"
2704 "psubh $f24, $f24, $f28 \n\t"
2705 "psubh $f26, $f26, $f30 \n\t"
2706 "paddh $f0, $f0, $f4 \n\t"
2707 "paddh $f2, $f2, $f6 \n\t"
2708 "paddh $f0, $f0, $f24 \n\t"
2709 "paddh $f2, $f2, $f26 \n\t"
2710 "psllh $f24, $f24, $f12 \n\t"
2711 "psllh $f26, $f26, $f12 \n\t"
2712 "paddh $f0, $f0, $f24 \n\t"
2713 "paddh $f2, $f2, $f26 \n\t"
// Low word of $f0 (results 8 and 9) goes to tap bytes 0x10..0x13.
2714 "gsswlc1 $f0, 0x13(%[pTap]) \n\t"
2715 "gsswrc1 $f0, 0x10(%[pTap]) \n\t"
2716
// Load source offset 0xe and shift the rows by one column for results 9..16.
2717 "gsldlc1 $f0, 0x15(%[pSrc]) \n\t"
2718 "xor $f28, $f28, $f28 \n\t"
2719 "gsldrc1 $f0, 0xE(%[pSrc]) \n\t"
2720 "punpckhbh $f2, $f0, $f28 \n\t"
2721 "punpcklbh $f0, $f0, $f28 \n\t"
2722 "dli $8, 0x2 \n\t"
2723 "dmtc1 $9, $f12 \n\t"
2724 "dmtc1 $8, $f24 \n\t"
2725
2726 "paddh $f16, $f16, $f4 \n\t"
2727 "paddh $f18, $f18, $f6 \n\t"
2728 "paddh $f20, $f20, $f12 \n\t"
2729 "paddh $f22, $f22, $f14 \n\t"
2730 "psllh $f20, $f20, $f24 \n\t"
2731 "psllh $f22, $f22, $f24 \n\t"
2732 "psubh $f20, $f20, $f16 \n\t"
2733 "psubh $f22, $f22, $f18 \n\t"
2734 "paddh $f8, $f8, $f0 \n\t"
2735 "paddh $f10, $f10, $f2 \n\t"
2736 "paddh $f8, $f8, $f20 \n\t"
2737 "paddh $f10, $f10, $f22 \n\t"
2738 "psllh $f20, $f20, $f24 \n\t"
2739 "psllh $f22, $f22, $f24 \n\t"
2740 "paddh $f8, $f8, $f20 \n\t"
2741 "paddh $f10, $f10, $f22 \n\t"
// Results 9..16 go to tap bytes 0x12..0x21 (result 9 is written a second time).
2742 "gssdlc1 $f8, 0x19(%[pTap]) \n\t"
2743 "gssdlc1 $f10, 0x21(%[pTap]) \n\t"
2744 "gssdrc1 $f8, 0x12(%[pTap]) \n\t"
2745 "gssdrc1 $f10, 0x1a(%[pTap]) \n\t"
2746
// Next source / tap row.
2747 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
2748 PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t"
2749 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2750 "bnez %[iHeight], 2b \n\t"
2751 "3: \n\t"
// iWidth is only read by the assembly although it is listed as read-write.
2752 : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth),
2753 [iHeight]"+&r"(iHeight)
2754 : [iSrcStride]"r"(iSrcStride), [iTapStride]"r"(iTapStride)
2755 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
2756 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2757 );
2758 RECOVER_REG;
2759 }
2760
// Vertical last pass of the (2, 2) interpolation, Loongson MMI inline assembly,
// variant that reads the tap rows with quad-word loads (gslqc1).
//
// Works in strips of 8 outputs: every strip reads 16 bytes per tap row and the
// strip pointers move by 0x10 in pTap and 0x8 in pDst. Per strip, six tap rows
// are loaded up front and FILTER_VER_ALIGN is expanded once per output row
// while one new tap row is loaded per step.
//
//   pTap, iTapStride  tap rows produced by the horizontal pass, pitch in bytes
//   pDst, iDstStride  destination and its row pitch in bytes
//   iWidth            shifted right by 3 to get the strip count
//   iHeight           output rows per strip
//
// Destination addressing: the macro receives pDst plus an offset register that
// alternates between $0 and iDstStride, and pDst itself advances by
// $14 = 2 * iDstStride after every second row of the loop.
//
// Preconditions visible in the code:
//   - the first row is written without a zero test, so iHeight is expected to
//     be >= 2;
//   - the strip counter is decremented before it is tested, so iWidth >> 3 is
//     expected to be >= 1;
//   - the base pointers saved in $10/$11 are never advanced, so only two
//     distinct strips can be addressed.
//
// NOTE(review): FILTER_VER_ALIGN is defined outside this chunk; $15 holds
// 0x0020002000200020 and is handed to it as the last argument, presumably as
// a per-halfword rounding term -- confirm. gslqc1 presumably requires 16-byte
// aligned addresses -- confirm against the Loongson instruction reference.
McHorVer22Width8VerLastAlign_mmi(const uint8_t * pTap,int32_t iTapStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2761 static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap,
2762 int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2763 int32_t iWidth, int32_t iHeight) {
2764 BACKUP_REG;
2765 __asm__ volatile (
2766 ".set arch=loongson3a \n\t"
// Keep the entry state for the strip restart; $13/$14 are the doubled pitches.
2767 "move $10, %[pTap] \n\t"
2768 "move $11, %[pDst] \n\t"
2769 "move $12, %[iHeight] \n\t"
2770 "dsrl %[iWidth], 0x3 \n\t"
2771 PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t"
2772 PTR_ADDU "$14, %[iDstStride], %[iDstStride] \n\t"
2773 "dli $15, 0x0020002000200020 \n\t"
2774
// ---- label 4: start of one strip; load six consecutive tap rows ----
2775 "4: \n\t"
2776 "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
2777 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2778 "gslqc1 $f6, $f4, 0x0($8) \n\t"
2779 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2780 "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t"
2781 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2782 "gslqc1 $f14, $f12, 0x0($8) \n\t"
2783 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2784 "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t"
2785 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2786 "gslqc1 $f22, $f20, 0x0($8) \n\t"
2787
// First output row of the strip (no zero test on the counter here).
2788 FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2789 $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2790
// Fetch the seventh tap row and slide the window down by one row.
2791 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2792 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2793 "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t"
2794 "mov.d $f0, $f4 \n\t"
2795 "mov.d $f2, $f6 \n\t"
2796 "mov.d $f4, $f8 \n\t"
2797 "mov.d $f6, $f10 \n\t"
2798 "mov.d $f8, $f12 \n\t"
2799 "mov.d $f10, $f14 \n\t"
2800 "mov.d $f12, $f16 \n\t"
2801 "mov.d $f14, $f18 \n\t"
2802 "mov.d $f16, $f20 \n\t"
2803 "mov.d $f18, $f22 \n\t"
2804 "mov.d $f20, $f24 \n\t"
2805 "mov.d $f22, $f26 \n\t"
2806 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2807 PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t"
2808
// ---- label 5: row loop, unrolled eight times ----
// The register list is rotated by one register pair per step; each step writes
// one row, leaves through label 6 when the counter hits zero and otherwise
// loads one new tap row.
2809 "5: \n\t"
2810 FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
2811 $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
2812 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2813 "beqz %[iHeight], 6f \n\t"
2814 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2815 "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t"
2816
2817 FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
2818 $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15)
2819 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2820 "beqz %[iHeight], 6f \n\t"
2821 PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
2822 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2823 "gslqc1 $f30, $f28, 0x0($8) \n\t"
2824
2825 FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2826 $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15)
2827 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2828 "beqz %[iHeight], 6f \n\t"
2829 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2830 "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
2831
2832 FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
2833 $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15)
2834 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2835 "beqz %[iHeight], 6f \n\t"
2836 PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
2837 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2838 "gslqc1 $f6, $f4, 0x0($8) \n\t"
2839
2840 FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
2841 $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15)
2842 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2843 "beqz %[iHeight], 6f \n\t"
2844 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2845 "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t"
2846
2847 FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
2848 $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15)
2849 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2850 "beqz %[iHeight], 6f \n\t"
2851 PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
2852 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2853 "gslqc1 $f14, $f12, 0x0($8) \n\t"
2854
2855 FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
2856 $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15)
2857 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2858 "beqz %[iHeight], 6f \n\t"
2859 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2860 "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t"
2861
2862 FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
2863 $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15)
2864 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2865 "beqz %[iHeight], 6f \n\t"
2866 PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
2867 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2868 "gslqc1 $f22, $f20, 0x0($8) \n\t"
2869 "j 5b \n\t"
2870
// ---- label 6: strip finished; restore the entry state and move one strip right ----
2871 "6: \n\t"
2872 PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
2873 "beqz %[iWidth], 7f \n\t"
2874 "move %[pTap], $10 \n\t"
2875 "move %[pDst], $11 \n\t"
2876 "move %[iHeight], $12 \n\t"
2877 PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t"
2878 PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
2879 "j 4b \n\t"
2880 "7: \n\t"
2881 : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
2882 [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
2883 : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
2884 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
2885 "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18",
2886 "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
2887 );
2888 RECOVER_REG;
2889 }
2890
// Vertical last pass of the (2, 2) interpolation, Loongson MMI inline assembly,
// variant that reads every tap row with gsldlc1/gsldrc1 pairs (two 8-byte
// unaligned loads per row) instead of quad-word loads.
//
// Works in strips of 8 outputs: every strip reads 16 bytes per tap row and the
// strip pointers move by 0x10 in pTap and 0x8 in pDst. Per strip, six tap rows
// are loaded up front and FILTER_VER_UNALIGN is expanded once per output row
// while one new tap row is loaded per step; pDst advances by iDstStride after
// every row.
//
//   pTap, iTapStride  tap rows produced by the horizontal pass, pitch in bytes
//   pDst, iDstStride  destination and its row pitch in bytes
//   iWidth            shifted right by 3 to get the strip count
//   iHeight           output rows per strip
//
// Preconditions visible in the code:
//   - the first row is written without a zero test, so iHeight is expected to
//     be >= 2;
//   - the strip counter is decremented before it is tested, so iWidth >> 3 is
//     expected to be >= 1;
//   - the base pointers saved in $10/$11 are never advanced, so only two
//     distinct strips can be addressed.
//
// NOTE(review): FILTER_VER_UNALIGN is defined outside this chunk; $14 holds
// 0x0020002000200020 and is handed to it as the last argument, presumably as
// a per-halfword rounding term -- confirm.
McHorVer22Width8VerLastUnAlign_mmi(const uint8_t * pTap,int32_t iTapStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)2891 static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap,
2892 int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
2893 int32_t iWidth, int32_t iHeight) {
2894 BACKUP_REG;
2895 __asm__ volatile (
2896 ".set arch=loongson3a \n\t"
// Keep the entry state for the strip restart; $13 = 2 * iTapStride.
2897 "move $10, %[pTap] \n\t"
2898 "move $11, %[pDst] \n\t"
2899 "move $12, %[iHeight] \n\t"
2900 "dsrl %[iWidth], 0x3 \n\t"
2901 PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t"
2902 "dli $14, 0x0020002000200020 \n\t"
2903
// ---- label 4: start of one strip; load six consecutive tap rows, two per group ----
2904 "4: \n\t"
2905 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2906 "gsldlc1 $f0, 0x7(%[pTap]) \n\t"
2907 "gsldlc1 $f2, 0xF(%[pTap]) \n\t"
2908 "gsldlc1 $f4, 0x7($8) \n\t"
2909 "gsldlc1 $f6, 0xF($8) \n\t"
2910 "gsldrc1 $f0, 0x0(%[pTap]) \n\t"
2911 "gsldrc1 $f2, 0x8(%[pTap]) \n\t"
2912 "gsldrc1 $f4, 0x0($8) \n\t"
2913 "gsldrc1 $f6, 0x8($8) \n\t"
2914 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2915 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2916 "gsldlc1 $f8, 0x7(%[pTap]) \n\t"
2917 "gsldlc1 $f10, 0xF(%[pTap]) \n\t"
2918 "gsldlc1 $f12, 0x7($8) \n\t"
2919 "gsldlc1 $f14, 0xF($8) \n\t"
2920 "gsldrc1 $f8, 0x0(%[pTap]) \n\t"
2921 "gsldrc1 $f10, 0x8(%[pTap]) \n\t"
2922 "gsldrc1 $f12, 0x0($8) \n\t"
2923 "gsldrc1 $f14, 0x8($8) \n\t"
2924 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2925 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2926 "gsldlc1 $f16, 0x7(%[pTap]) \n\t"
2927 "gsldlc1 $f18, 0xF(%[pTap]) \n\t"
2928 "gsldlc1 $f20, 0x7($8) \n\t"
2929 "gsldlc1 $f22, 0xF($8) \n\t"
2930 "gsldrc1 $f16, 0x0(%[pTap]) \n\t"
2931 "gsldrc1 $f18, 0x8(%[pTap]) \n\t"
2932 "gsldrc1 $f20, 0x0($8) \n\t"
2933 "gsldrc1 $f22, 0x8($8) \n\t"
2934
// First output row of the strip (no zero test on the counter here).
2935 FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2936 $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
// Fetch the seventh tap row and slide the window down by one row.
2937 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2938 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2939 "gsldlc1 $f24, 0x7(%[pTap]) \n\t"
2940 "gsldlc1 $f26, 0xF(%[pTap]) \n\t"
2941 "gsldrc1 $f24, 0x0(%[pTap]) \n\t"
2942 "gsldrc1 $f26, 0x8(%[pTap]) \n\t"
2943 "mov.d $f0, $f4 \n\t"
2944 "mov.d $f2, $f6 \n\t"
2945 "mov.d $f4, $f8 \n\t"
2946 "mov.d $f6, $f10 \n\t"
2947 "mov.d $f8, $f12 \n\t"
2948 "mov.d $f10, $f14 \n\t"
2949 "mov.d $f12, $f16 \n\t"
2950 "mov.d $f14, $f18 \n\t"
2951 "mov.d $f16, $f20 \n\t"
2952 "mov.d $f18, $f22 \n\t"
2953 "mov.d $f20, $f24 \n\t"
2954 "mov.d $f22, $f26 \n\t"
2955 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2956 PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t"
2957
// ---- label 5: row loop, unrolled eight times ----
// The register list is rotated by one register pair per step; each step writes
// one row, leaves through label 6 when the counter hits zero and otherwise
// loads one new tap row and moves pDst down one row.
2958 "5: \n\t"
2959 FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
2960 $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
2961
2962 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2963 "beqz %[iHeight], 6f \n\t"
2964 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2965 "gsldlc1 $f24, 0x7(%[pTap]) \n\t"
2966 "gsldlc1 $f26, 0xF(%[pTap]) \n\t"
2967 "gsldrc1 $f24, 0x0(%[pTap]) \n\t"
2968 "gsldrc1 $f26, 0x8(%[pTap]) \n\t"
2969 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2970
2971 FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22,
2972 $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14)
2973 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2974 "beqz %[iHeight], 6f \n\t"
2975 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2976 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2977 "gsldlc1 $f28, 0x7($8) \n\t"
2978 "gsldlc1 $f30, 0xF($8) \n\t"
2979 "gsldrc1 $f28, 0x0($8) \n\t"
2980 "gsldrc1 $f30, 0x8($8) \n\t"
2981
2982 FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
2983 $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14)
2984 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2985 "beqz %[iHeight], 6f \n\t"
2986 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
2987 "gsldlc1 $f0, 0x7(%[pTap]) \n\t"
2988 "gsldlc1 $f2, 0xF(%[pTap]) \n\t"
2989 "gsldrc1 $f0, 0x0(%[pTap]) \n\t"
2990 "gsldrc1 $f2, 0x8(%[pTap]) \n\t"
2991 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2992
2993 FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
2994 $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14)
2995 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
2996 "beqz %[iHeight], 6f \n\t"
2997 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
2998 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
2999 "gsldlc1 $f4, 0x7($8) \n\t"
3000 "gsldlc1 $f6, 0xF($8) \n\t"
3001 "gsldrc1 $f4, 0x0($8) \n\t"
3002 "gsldrc1 $f6, 0x8($8) \n\t"
3003
3004 FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2,
3005 $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14)
3006 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3007 "beqz %[iHeight], 6f \n\t"
3008 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
3009 "gsldlc1 $f8, 0x7(%[pTap]) \n\t"
3010 "gsldlc1 $f10, 0xF(%[pTap]) \n\t"
3011 "gsldrc1 $f8, 0x0(%[pTap]) \n\t"
3012 "gsldrc1 $f10, 0x8(%[pTap]) \n\t"
3013 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3014
3015 FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
3016 $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14)
3017 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3018 "beqz %[iHeight], 6f \n\t"
3019 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3020 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
3021 "gsldlc1 $f12, 0x7($8) \n\t"
3022 "gsldlc1 $f14, 0xF($8) \n\t"
3023 "gsldrc1 $f12, 0x0($8) \n\t"
3024 "gsldrc1 $f14, 0x8($8) \n\t"
3025
3026 FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
3027 $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14)
3028 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3029 "beqz %[iHeight], 6f \n\t"
3030 PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
3031 "gsldlc1 $f16, 0x7(%[pTap]) \n\t"
3032 "gsldlc1 $f18, 0xF(%[pTap]) \n\t"
3033 "gsldrc1 $f16, 0x0(%[pTap]) \n\t"
3034 "gsldrc1 $f18, 0x8(%[pTap]) \n\t"
3035 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3036
3037 FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
3038 $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14)
3039 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3040 "beqz %[iHeight], 6f \n\t"
3041 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3042 PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
3043 "gsldlc1 $f20, 0x7($8) \n\t"
3044 "gsldlc1 $f22, 0xF($8) \n\t"
3045 "gsldrc1 $f20, 0x0($8) \n\t"
3046 "gsldrc1 $f22, 0x8($8) \n\t"
3047 "j 5b \n\t"
3048
// ---- label 6: strip finished; restore the entry state and move one strip right ----
3049 "6: \n\t"
3050 PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
3051 "beqz %[iWidth], 7f \n\t"
3052 "move %[pTap], $10 \n\t"
3053 "move %[pDst], $11 \n\t"
3054 "move %[iHeight], $12 \n\t"
3055 PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t"
3056 PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
3057 "j 4b \n\t"
3058
3059 "7: \n\t"
3060 : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
3061 [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
3062 : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
3063 : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
3064 "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
3065 "$f22", "$f24", "$f26", "$f28", "$f30"
3066 );
3067 RECOVER_REG;
3068 }
3069
3070 //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3071 static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc,
3072 int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
3073 int32_t iWidth, int32_t iHeight) {
3074 ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
3075
3076 if (iWidth == 17 || iWidth == 9){
3077 int32_t tmp1 = 2 * (iWidth - 8);
3078 McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
3079
3080 McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
3081
3082 McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8,
3083 iDstStride, 8, iHeight);
3084 } else {
3085 int16_t iTmp[17 + 5];
3086 int32_t i, j, k;
3087
3088 for (i = 0; i < iHeight; i++) {
3089 for (j = 0; j < iWidth + 5; j++) {
3090 iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
3091 }
3092 for (k = 0; k < iWidth; k++) {
3093 pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
3094 }
3095 pSrc += iSrcStride;
3096 pDst += iDstStride;
3097 }
3098 }
3099 }
3100
// Copies iHeight rows of 4 bytes from pSrc to pDst (row pitches iSrcStride and
// iDstStride, in bytes). Each row travels through $8 using the lwl/lwr and
// swl/swr unaligned word pairs, so neither pointer needs word alignment.
// The counter is tested after the loop body: iHeight is expected to be >= 1.
McCopyWidthEq4_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3101 void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride,
3102 uint8_t *pDst, int iDstStride, int iHeight) {
3103 __asm__ volatile (
3104 ".set arch=loongson3a \n\t"
3105 "1: \n\t"
// One row: unaligned 4-byte load followed by unaligned 4-byte store.
3106 "lwl $8, 0x3(%[pSrc]) \n\t"
3107 "lwr $8, 0x0(%[pSrc]) \n\t"
3108 "swl $8, 0x3(%[pDst]) \n\t"
3109 "swr $8, 0x0(%[pDst]) \n\t"
// Step both pointers to the next row and count down.
3110 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3111 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3112 PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
3113 "bnez %[iHeight], 1b \n\t"
3114 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3115 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3116 : "memory", "$8"
3117 );
3118 }
3119
// Copies iHeight rows of 8 bytes from pSrc to pDst (row pitches iSrcStride and
// iDstStride, in bytes). Each row travels through $8 using the ldl/ldr and
// sdl/sdr unaligned doubleword pairs, so neither pointer needs 8-byte alignment.
// The counter is tested after the loop body: iHeight is expected to be >= 1.
McCopyWidthEq8_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3120 void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride,
3121 uint8_t *pDst, int iDstStride, int iHeight) {
3122 __asm__ volatile (
3123 ".set arch=loongson3a \n\t"
3124 "1: \n\t"
// One row: unaligned 8-byte load followed by unaligned 8-byte store.
3125 "ldl $8, 0x7(%[pSrc]) \n\t"
3126 "ldr $8, 0x0(%[pSrc]) \n\t"
3127 "sdl $8, 0x7(%[pDst]) \n\t"
3128 "sdr $8, 0x0(%[pDst]) \n\t"
// Step both pointers to the next row and count down.
3129 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3130 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3131 PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
3132 "bnez %[iHeight], 1b \n\t"
3133 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3134 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3135 : "memory", "$8"
3136 );
3137 }
3138
// Copies iHeight rows of 16 bytes from pSrc to pDst (row pitches iSrcStride and
// iDstStride, in bytes). Each row is moved as two doublewords through $8 (bytes
// 0x0..0x7) and $9 (bytes 0x8..0xF) using the ldl/ldr and sdl/sdr unaligned
// pairs, so neither pointer needs 8-byte alignment.
// The counter is tested after the loop body: iHeight is expected to be >= 1.
McCopyWidthEq16_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3139 void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride,
3140 uint8_t *pDst, int iDstStride, int iHeight) {
3141 __asm__ volatile (
3142 ".set arch=loongson3a \n\t"
3143 "1: \n\t"
// One row: both halves are loaded before either is stored.
3144 "ldl $8, 0x7(%[pSrc]) \n\t"
3145 "ldl $9, 0xF(%[pSrc]) \n\t"
3146 "ldr $8, 0x0(%[pSrc]) \n\t"
3147 "ldr $9, 0x8(%[pSrc]) \n\t"
3148 "sdl $8, 0x7(%[pDst]) \n\t"
3149 "sdl $9, 0xF(%[pDst]) \n\t"
3150 "sdr $8, 0x0(%[pDst]) \n\t"
3151 "sdr $9, 0x8(%[pDst]) \n\t"
// Step both pointers to the next row and count down.
3152 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3153 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3154 PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
3155 "bnez %[iHeight], 1b \n\t"
3156 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3157 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3158 : "memory", "$8", "$9"
3159 );
3160 }
3161
McCopy_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3162 static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3163 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3164 if (iWidth == 16)
3165 McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3166 else if (iWidth == 8)
3167 McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3168 else if (iWidth == 4)
3169 McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3170 else
3171 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3172 }
3173
/*!
 * \brief   Weighted 2x2 interpolation of a 4-pixel-wide block.
 *
 * With w0..w3 being bytes 0..3 read from pABCD (little-endian lane order
 * assumed), each output pixel is
 *   (w0*cur[x] + w1*cur[x+1] + w2*next[x] + w3*next[x+1] + 32) >> 6
 * saturated to 8 bits, where cur/next are two consecutive source rows.
 *
 * \param   pSrc        first source row of the block
 * \param   iSrcStride  source row pitch in bytes
 * \param   pDst        destination block (4 bytes are written per row)
 * \param   iDstStride  destination row pitch in bytes
 * \param   pABCD       the four 8-bit weights; after they have been read this
 *                      operand is reused inside the asm as the "next row" pointer
 * \param   iHeight     number of output rows; tested after each row, so >= 1
 */
void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                          int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    // Load the weights and broadcast each one into four 16-bit lanes:
    // $f6 = w0, $f10 = w1, $f8 = w2, $f12 = w3.  $f14 is the zero register.
    "gsldlc1 $f6, 0x7(%[pABCD]) \n\t"
    "gsldrc1 $f6, 0x0(%[pABCD]) \n\t"
    "xor $f14, $f14, $f14 \n\t"
    "punpcklbh $f6, $f6, $f6 \n\t"
    "mov.d $f8, $f6 \n\t"
    "punpcklhw $f6, $f6, $f6 \n\t"
    "punpckhhw $f8, $f8, $f8 \n\t"
    "mov.d $f10, $f6 \n\t"
    "punpcklbh $f6, $f6, $f14 \n\t"
    "punpckhbh $f10, $f10, $f14 \n\t"

    "mov.d $f12, $f8 \n\t"
    "punpcklbh $f8, $f8, $f14 \n\t"
    "punpckhbh $f12, $f12, $f14 \n\t"
    // From here on pABCD points at the row below pSrc.
    PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
    "dli $8, 0x6 \n\t"
    // Prime the loop with the first row: $f0 = cur[x], $f2 = cur[x+1].
    "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
    "gsldlc1 $f2, 0x8(%[pSrc]) \n\t"
    "dmtc1 $8, $f16 \n\t"                 // $f16 = shift count 6
    "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
    "gsldrc1 $f2, 0x1(%[pSrc]) \n\t"
    "dli $8, 0x0020002000200020 \n\t"
    "punpcklbh $f0, $f0, $f14 \n\t"
    "punpcklbh $f2, $f2, $f14 \n\t"

    "dmtc1 $8, $f18 \n\t"                 // $f18 = rounding term 32 in every lane
    "1: \n\t"
    // Contribution of the current row.
    "pmullh $f0, $f0, $f6 \n\t"
    "pmullh $f2, $f2, $f10 \n\t"
    "paddh $f0, $f0, $f2 \n\t"

    // next[x]: keep an unscaled copy in $f4 for the following iteration.
    "gsldlc1 $f2, 0x7(%[pABCD]) \n\t"
    "gsldrc1 $f2, 0x0(%[pABCD]) \n\t"
    "punpcklbh $f2, $f2, $f14 \n\t"
    "mov.d $f4, $f2 \n\t"
    "pmullh $f2, $f2, $f8 \n\t"
    "paddh $f0, $f0, $f2 \n\t"
    // next[x+1]: the unscaled copy is parked in $f14, which therefore stops
    // being zero until it is cleared again below.
    "gsldlc1 $f2, 0x8(%[pABCD]) \n\t"
    "gsldrc1 $f2, 0x1(%[pABCD]) \n\t"
    "punpcklbh $f2, $f2, $f14 \n\t"
    "mov.d $f14, $f2 \n\t"
    "pmullh $f2, $f2, $f12 \n\t"
    "paddh $f0, $f0, $f2 \n\t"
    "mov.d $f2, $f14 \n\t"                // next[x+1] becomes cur[x+1]
    // Round, shift, re-zero $f14, saturate to bytes and store 4 pixels.
    "paddh $f0, $f0, $f18 \n\t"
    "psrlh $f0, $f0, $f16 \n\t"
    "xor $f14, $f14, $f14 \n\t"
    "packushb $f0, $f0, $f14 \n\t"
    "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
    "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
    "mov.d $f0, $f4 \n\t"                 // next[x] becomes cur[x]
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
    PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
    "bnez %[iHeight], 1b \n\t"
    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
      [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight)
    : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18"
  );
}
3240
McChromaWidthEq8_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,const uint8_t * pABCD,int32_t iHeight)3241 void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
3242 int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
3243 BACKUP_REG;
3244 __asm__ volatile (
3245 ".set arch=loongson3a \n\t"
3246 "gsldlc1 $f12, 0x7(%[pABCD]) \n\t"
3247 "xor $f28, $f28, $f28 \n\t"
3248 "gsldrc1 $f12, 0x0(%[pABCD]) \n\t"
3249 "punpcklbh $f12, $f12, $f12 \n\t"
3250 "punpckhhw $f14, $f12, $f12 \n\t"
3251 "punpcklhw $f12, $f12, $f12 \n\t"
3252
3253 "mov.d $f16, $f14 \n\t"
3254 "punpckhwd $f14, $f12, $f12 \n\t"
3255 "punpcklwd $f12, $f12, $f12 \n\t"
3256 "punpckhwd $f18, $f16, $f16 \n\t"
3257 "punpcklwd $f16, $f16, $f16 \n\t"
3258 "mov.d $f20, $f14 \n\t"
3259 "mov.d $f24, $f18 \n\t"
3260
3261 "punpckhbh $f14, $f12, $f28 \n\t"
3262 "punpcklbh $f12, $f12, $f28 \n\t"
3263 "punpckhbh $f22, $f20, $f28 \n\t"
3264 "punpcklbh $f20, $f20, $f28 \n\t"
3265 "punpckhbh $f18, $f16, $f28 \n\t"
3266 "punpcklbh $f16, $f16, $f28 \n\t"
3267 "punpckhbh $f26, $f24, $f28 \n\t"
3268 "punpcklbh $f24, $f24, $f28 \n\t"
3269
3270 PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
3271 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
3272 "gsldlc1 $f4, 0x8(%[pSrc]) \n\t"
3273 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
3274 "gsldrc1 $f4, 0x1(%[pSrc]) \n\t"
3275 "punpckhbh $f2, $f0, $f28 \n\t"
3276 "punpcklbh $f0, $f0, $f28 \n\t"
3277 "punpckhbh $f6, $f4, $f28 \n\t"
3278 "punpcklbh $f4, $f4, $f28 \n\t"
3279 "1: \n\t"
3280 "dli $8, 0x20 \n\t"
3281 "dmtc1 $8, $f30 \n\t"
3282
3283 "pmullh $f0, $f0, $f12 \n\t"
3284 "pmullh $f2, $f2, $f14 \n\t"
3285 "pmullh $f4, $f4, $f20 \n\t"
3286 "pmullh $f6, $f6, $f22 \n\t"
3287 "paddh $f0, $f0, $f4 \n\t"
3288 "paddh $f2, $f2, $f6 \n\t"
3289
3290 "gsldlc1 $f4, 0x7(%[pABCD]) \n\t"
3291 "gsldrc1 $f4, 0x0(%[pABCD]) \n\t"
3292 "punpckhbh $f6, $f4, $f28 \n\t"
3293 "punpcklbh $f4, $f4, $f28 \n\t"
3294 "mov.d $f8, $f4 \n\t"
3295 "mov.d $f10, $f6 \n\t"
3296 "pmullh $f4, $f4, $f16 \n\t"
3297 "pmullh $f6, $f6, $f18 \n\t"
3298 "paddh $f0, $f0, $f4 \n\t"
3299 "paddh $f2, $f2, $f6 \n\t"
3300
3301 "gsldlc1 $f4, 0x8(%[pABCD]) \n\t"
3302 "gsldrc1 $f4, 0x1(%[pABCD]) \n\t"
3303 "punpckhbh $f6, $f4, $f28 \n\t"
3304 "punpcklbh $f4, $f4, $f28 \n\t"
3305 "mov.d $f28, $f4 \n\t"
3306 "mov.d $f30, $f6 \n\t"
3307 "pmullh $f4, $f4, $f24 \n\t"
3308 "pmullh $f6, $f6, $f26 \n\t"
3309 "paddh $f0, $f0, $f4 \n\t"
3310 "paddh $f2, $f2, $f6 \n\t"
3311 "mov.d $f4, $f28 \n\t"
3312 "mov.d $f6, $f30 \n\t"
3313
3314 "dli $8, 0x0020002000200020 \n\t"
3315 "dmfc1 $9, $f20 \n\t"
3316 "dmtc1 $8, $f20 \n\t"
3317 "dli $8, 0x6 \n\t"
3318 "paddh $f0, $f0, $f20 \n\t"
3319 "paddh $f2, $f2, $f20 \n\t"
3320 "dmtc1 $8, $f20 \n\t"
3321 "psrlh $f0, $f0, $f20 \n\t"
3322 "psrlh $f2, $f2, $f20 \n\t"
3323
3324 "xor $f28, $f28, $f28 \n\t"
3325 "packushb $f0, $f0, $f2 \n\t"
3326 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
3327 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
3328
3329 "mov.d $f0, $f8 \n\t"
3330 "mov.d $f2, $f10 \n\t"
3331 "dmtc1 $9, $f20 \n\t"
3332 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3333 PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
3334
3335 PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
3336 "bnez %[iHeight], 1b \n\t"
3337 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD),
3338 [iHeight]"+&r"(iHeight)
3339 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3340 : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3341 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3342 );
3343 RECOVER_REG;
3344 }
3345
McChroma_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)3346 void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3347 int32_t iDstStride, int16_t iMvX, int16_t iMvY,
3348 int32_t iWidth, int32_t iHeight) {
3349 static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
3350 McChromaWidthEq4_mmi,
3351 McChromaWidthEq8_mmi
3352 };
3353 const int32_t kiD8x = iMvX & 0x07;
3354 const int32_t kiD8y = iMvY & 0x07;
3355 if (kiD8x == 0 && kiD8y == 0) {
3356 McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
3357 return;
3358 }
3359 if (iWidth != 2) {
3360 kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
3361 g_kuiABCD[kiD8y][kiD8x], iHeight);
3362 } else
3363 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
3364 iWidth, iHeight);
3365 }
3366
/*!
 * \brief   Horizontal 6-tap filter for an 8-pixel-wide block.
 *
 * For every output pixel x the code evaluates
 *   (p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1] - 5*p[x+2] + p[x+3] + 16) >> 5
 * in 16-bit lanes and saturates the result to 8 bits.
 *
 * \param   pSrc        source block; bytes from pSrc-2 onwards are read
 * \param   iSrcStride  source row pitch in bytes
 * \param   pDst        destination block (8 bytes are written per row)
 * \param   iDstStride  destination row pitch in bytes
 * \param   iHeight     number of rows; tested after each row, so >= 1
 */
void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
                            int iDstStride, int iHeight) {
  // BACKUP_REG / RECOVER_REG come from the MMI helper header; presumably they
  // preserve the FP registers this routine overwrites -- confirm there.
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    // Start two pixels to the left so that offsets 0..5 are taps -2..+3.
    PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
    // Constants: $f28 = 0, $f24 = 16 per lane, $f26 = shift 2, $f30 = shift 5.
    "xor $f28, $f28, $f28 \n\t"
    "dli $8, 0x0010001000100010 \n\t"
    "dmtc1 $8, $f24 \n\t"
    "dli $8, 0x2 \n\t"
    "dmtc1 $8, $f26 \n\t"
    "dli $8, 0x5 \n\t"
    "dmtc1 $8, $f30 \n\t"
    "1: \n\t"
    // Six unaligned 8-byte loads, one per tap:
    // $f0 = tap -2, $f8 = tap -1, $f16 = tap 0, $f20 = tap +1, $f12 = tap +2, $f4 = tap +3.
    "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
    "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
    "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
    "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
    "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
    "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
    "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
    "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
    "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
    "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
    "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
    "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
    // Widen bytes to 16-bit lanes: the odd-numbered partner register of each
    // pair receives the upper four pixels, the original register the lower four.
    "punpckhbh $f2, $f0, $f28 \n\t"
    "punpckhbh $f6, $f4, $f28 \n\t"
    "punpckhbh $f10, $f8, $f28 \n\t"
    "punpckhbh $f14, $f12, $f28 \n\t"
    "punpckhbh $f18, $f16, $f28 \n\t"
    "punpckhbh $f22, $f20, $f28 \n\t"
    "punpcklbh $f0, $f0, $f28 \n\t"
    "punpcklbh $f4, $f4, $f28 \n\t"
    "punpcklbh $f8, $f8, $f28 \n\t"
    "punpcklbh $f12, $f12, $f28 \n\t"
    "punpcklbh $f16, $f16, $f28 \n\t"
    "punpcklbh $f20, $f20, $f28 \n\t"
    // b = tap(-1) + tap(+2), c = tap(0) + tap(+1)
    "paddh $f8, $f8, $f12 \n\t"
    "paddh $f10, $f10, $f14 \n\t"
    "paddh $f16, $f16, $f20 \n\t"
    "paddh $f18, $f18, $f22 \n\t"
    // t = 4*c - b
    "psllh $f16, $f16, $f26 \n\t"
    "psllh $f18, $f18, $f26 \n\t"
    "psubh $f16, $f16, $f8 \n\t"
    "psubh $f18, $f18, $f10 \n\t"
    // sum = tap(-2) + tap(+3) + t + 4*t  ==  a - 5*b + 20*c
    "paddh $f0, $f0, $f4 \n\t"
    "paddh $f2, $f2, $f6 \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    "psllh $f16, $f16, $f26 \n\t"
    "psllh $f18, $f18, $f26 \n\t"
    "paddh $f0, $f0, $f16 \n\t"
    "paddh $f2, $f2, $f18 \n\t"
    // Round (+16), arithmetic shift by 5, saturate to bytes, store 8 pixels.
    "paddh $f0, $f0, $f24 \n\t"
    "paddh $f2, $f2, $f24 \n\t"
    "psrah $f0, $f0, $f30 \n\t"
    "psrah $f2, $f2, $f30 \n\t"
    "packushb $f0, $f0, $f2 \n\t"
    "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
    "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "bnez %[iHeight], 1b \n\t"
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}
3439
McHorVer20WidthEq16_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3440 void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3441 int iDstStride, int iHeight) {
3442 BACKUP_REG;
3443 __asm__ volatile (
3444 ".set arch=loongson3a \n\t"
3445 PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
3446 "dli $8, 0x0010001000100010 \n\t"
3447 "dmtc1 $8, $f24 \n\t"
3448 "dli $8, 0x2 \n\t"
3449 "dmtc1 $8, $f26 \n\t"
3450 "dli $8, 0x5 \n\t"
3451 "dmtc1 $8, $f30 \n\t"
3452 "1: \n\t"
3453 "xor $f28, $f28, $f28 \n\t"
3454 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
3455 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
3456 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
3457 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
3458 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
3459 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
3460 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
3461 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
3462 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
3463 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
3464 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
3465 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
3466 "punpckhbh $f2, $f0, $f28 \n\t"
3467 "punpckhbh $f6, $f4, $f28 \n\t"
3468 "punpckhbh $f10, $f8, $f28 \n\t"
3469 "punpckhbh $f14, $f12, $f28 \n\t"
3470 "punpckhbh $f18, $f16, $f28 \n\t"
3471 "punpckhbh $f22, $f20, $f28 \n\t"
3472 "punpcklbh $f0, $f0, $f28 \n\t"
3473 "punpcklbh $f4, $f4, $f28 \n\t"
3474 "punpcklbh $f8, $f8, $f28 \n\t"
3475 "punpcklbh $f12, $f12, $f28 \n\t"
3476 "punpcklbh $f16, $f16, $f28 \n\t"
3477 "punpcklbh $f20, $f20, $f28 \n\t"
3478 "paddh $f8, $f8, $f12 \n\t"
3479 "paddh $f10, $f10, $f14 \n\t"
3480 "paddh $f16, $f16, $f20 \n\t"
3481 "paddh $f18, $f18, $f22 \n\t"
3482 "psllh $f16, $f16, $f26 \n\t"
3483 "psllh $f18, $f18, $f26 \n\t"
3484 "psubh $f16, $f16, $f8 \n\t"
3485 "psubh $f18, $f18, $f10 \n\t"
3486 "paddh $f0, $f0, $f4 \n\t"
3487 "paddh $f2, $f2, $f6 \n\t"
3488 "paddh $f0, $f0, $f16 \n\t"
3489 "paddh $f2, $f2, $f18 \n\t"
3490 "psllh $f16, $f16, $f26 \n\t"
3491 "psllh $f18, $f18, $f26 \n\t"
3492 "paddh $f0, $f0, $f16 \n\t"
3493 "paddh $f2, $f2, $f18 \n\t"
3494 "paddh $f0, $f0, $f24 \n\t"
3495 "paddh $f2, $f2, $f24 \n\t"
3496 "psrah $f0, $f0, $f30 \n\t"
3497 "psrah $f2, $f2, $f30 \n\t"
3498 "packushb $f0, $f0, $f2 \n\t"
3499 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
3500 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
3501 "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
3502 "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
3503 "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
3504 "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
3505 "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
3506 "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
3507 "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
3508 "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
3509 "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
3510 "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
3511 "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
3512 "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
3513 "punpckhbh $f2, $f0, $f28 \n\t"
3514 "punpckhbh $f6, $f4, $f28 \n\t"
3515 "punpckhbh $f10, $f8, $f28 \n\t"
3516 "punpckhbh $f14, $f12, $f28 \n\t"
3517 "punpckhbh $f18, $f16, $f28 \n\t"
3518 "punpckhbh $f22, $f20, $f28 \n\t"
3519 "punpcklbh $f0, $f0, $f28 \n\t"
3520 "punpcklbh $f4, $f4, $f28 \n\t"
3521 "punpcklbh $f8, $f8, $f28 \n\t"
3522 "punpcklbh $f12, $f12, $f28 \n\t"
3523 "punpcklbh $f16, $f16, $f28 \n\t"
3524 "punpcklbh $f20, $f20, $f28 \n\t"
3525 "paddh $f8, $f8, $f12 \n\t"
3526 "paddh $f10, $f10, $f14 \n\t"
3527 "paddh $f16, $f16, $f20 \n\t"
3528 "paddh $f18, $f18, $f22 \n\t"
3529 "psllh $f16, $f16, $f26 \n\t"
3530 "psllh $f18, $f18, $f26 \n\t"
3531 "psubh $f16, $f16, $f8 \n\t"
3532 "psubh $f18, $f18, $f10 \n\t"
3533 "paddh $f0, $f0, $f4 \n\t"
3534 "paddh $f2, $f2, $f6 \n\t"
3535 "paddh $f0, $f0, $f16 \n\t"
3536 "paddh $f2, $f2, $f18 \n\t"
3537 "psllh $f16, $f16, $f26 \n\t"
3538 "psllh $f18, $f18, $f26 \n\t"
3539 "paddh $f0, $f0, $f16 \n\t"
3540 "paddh $f2, $f2, $f18 \n\t"
3541 "paddh $f0, $f0, $f24 \n\t"
3542 "paddh $f2, $f2, $f24 \n\t"
3543 "psrah $f0, $f0, $f30 \n\t"
3544 "psrah $f2, $f2, $f30 \n\t"
3545 "packushb $f0, $f0, $f2 \n\t"
3546 "gssdlc1 $f0, 0xF(%[pDst]) \n\t"
3547 "gssdrc1 $f0, 0x8(%[pDst]) \n\t"
3548 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3549 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3550 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3551 "bnez %[iHeight], 1b \n\t"
3552 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3553 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3554 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3555 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3556 );
3557 RECOVER_REG;
3558 }
3559
McHorVer20WidthEq4_mmi(const uint8_t * pSrc,int iSrcStride,uint8_t * pDst,int iDstStride,int iHeight)3560 void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
3561 int iDstStride, int iHeight) {
3562 __asm__ volatile (
3563 ".set arch=loongson3a \n\t"
3564 "1: \n\t"
3565 PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
3566 "xor $f14, $f14, $f14 \n\t"
3567 "dli $8, 0x0010001000100010 \n\t"
3568 "dmtc1 $8, $f12 \n\t"
3569 "1: \n\t"
3570 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
3571 "gsldlc1 $f2, 0xc(%[pSrc]) \n\t"
3572 "gsldlc1 $f4, 0x8(%[pSrc]) \n\t"
3573 "gsldlc1 $f6, 0xb(%[pSrc]) \n\t"
3574 "gsldlc1 $f8, 0x9(%[pSrc]) \n\t"
3575 "gsldlc1 $f10, 0xa(%[pSrc]) \n\t"
3576 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
3577 "gsldrc1 $f2, 0x5(%[pSrc]) \n\t"
3578 "gsldrc1 $f4, 0x1(%[pSrc]) \n\t"
3579 "gsldrc1 $f6, 0x4(%[pSrc]) \n\t"
3580 "gsldrc1 $f8, 0x2(%[pSrc]) \n\t"
3581 "gsldrc1 $f10, 0x3(%[pSrc]) \n\t"
3582 "dli $8, 0x2 \n\t"
3583 "punpcklbh $f0, $f0, $f14 \n\t"
3584 "punpcklbh $f2, $f2, $f14 \n\t"
3585 "punpcklbh $f4, $f4, $f14 \n\t"
3586 "punpcklbh $f6, $f6, $f14 \n\t"
3587 "punpcklbh $f8, $f8, $f14 \n\t"
3588 "punpcklbh $f10, $f10, $f14 \n\t"
3589 "dmtc1 $8, $f16 \n\t"
3590 "paddh $f4, $f4, $f6 \n\t"
3591 "paddh $f8, $f8, $f10 \n\t"
3592 "psllh $f8, $f8, $f16 \n\t"
3593 "psubh $f8, $f8, $f4 \n\t"
3594 "paddh $f0, $f0, $f2 \n\t"
3595 "paddh $f0, $f0, $f8 \n\t"
3596 "dli $8, 0x5 \n\t"
3597 "psllh $f8, $f8, $f16 \n\t"
3598 "paddh $f0, $f0, $f8 \n\t"
3599 "paddh $f0, $f0, $f12 \n\t"
3600 "dmtc1 $8, $f16 \n\t"
3601 "psrah $f0, $f0, $f16 \n\t"
3602 "packushb $f0, $f0, $f14 \n\t"
3603 "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
3604 "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
3605 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3606 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3607 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3608 "bnez %[iHeight], 1b \n\t"
3609 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3610 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3611 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3612 "$f14", "$f16"
3613 );
3614 }
3615
McHorVer20_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3616 static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3617 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3618 if (iWidth == 16)
3619 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3620 else if (iWidth == 8)
3621 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3622 else
3623 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3624 }
3625
/*!
 * \brief   Vertical filter for an 8-pixel-wide block, built from the
 *          MMI_LOAD_8P and FILTER_HV_W8 helper macros.
 *
 * The routine starts two rows above pSrc, preloads six consecutive rows into
 * register pairs and then emits one output row per FILTER_HV_W8 invocation.
 * Between invocations exactly one new source row is loaded and the register
 * roles are rotated, so the loop body is unrolled eight times before jumping
 * back to the top.
 *
 * NOTE(review): MMI_LOAD_8P and FILTER_HV_W8 are defined outside this excerpt.
 * From the call sites, MMI_LOAD_8P(lo, hi, zero, ptr) appears to load one
 * 8-pixel row widened to 16-bit lanes, and FILTER_HV_W8 appears to take six
 * row pairs, two scratch pairs, the destination pointer and two scratch GPRs
 * -- confirm against their definitions.
 *
 * \param   pSrc        source block; rows from pSrc - 2*iSrcStride onwards are read
 * \param   iSrcStride  source row pitch in bytes
 * \param   pDst        destination block
 * \param   iDstStride  destination row pitch in bytes
 * \param   iHeight     number of output rows; the first row is produced before
 *                      the counter is tested, so >= 1
 */
void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
                            int iDstStride, int iHeight) {
  // BACKUP_REG / RECOVER_REG come from the MMI helper header; presumably they
  // preserve the FP registers this routine overwrites -- confirm there.
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    // Back up two rows.
    PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    "xor $f28, $f28, $f28 \n\t"
    // Preload six rows.  pSrc advances two rows at a time; the odd row in
    // between is addressed through $8 = pSrc + iSrcStride.
    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f4, $f6, $f28, $8)
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f12, $f14, $f28, $8)
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f20, $f22, $f28, $8)

    "1: \n\t"
    // Unrolled step 1 of 8.  Every step follows the same pattern: filter one
    // row, count it, leave through label 2 when done, otherwise fetch the next
    // source row into the pair that has just become free and advance pDst.
    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 2 of 8.
    FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f28, $f30, $f0, $8)
    // Step 3 of 8.
    FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 4 of 8.
    FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f4, $f6, $f8, $8)
    // Step 5 of 8.
    FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
                 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 6 of 8.
    FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
                 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f12, $f14, $f16, $8)
    // Step 7 of 8.
    FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
                 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    // Step 8 of 8; afterwards the register assignment matches the loop entry
    // again and control returns to label 1.
    FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
                 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    "beqz %[iHeight], 2f \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
    MMI_LOAD_8P($f20, $f22, $f24, $8)
    "j 1b \n\t"
    "2: \n\t"
    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}
3718
McHorVer02WidthEq16_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3719 static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3720 uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3721 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3722 McHorVer02WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
3723 }
3724
McHorVer02_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3725 static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3726 uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
3727 int32_t iHeight) {
3728 if (iWidth == 16)
3729 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3730 else if (iWidth == 8)
3731 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3732 else
3733 McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
3734 }
3735
McHorVer22Width8HorFirst_mmi(const uint8_t * pSrc,int16_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3736 void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int16_t iSrcStride,
3737 uint8_t *pDst, int32_t iDstStride, int32_t iHeight) {
3738 BACKUP_REG;
3739 __asm__ volatile (
3740 ".set arch=loongson3a \n\t"
3741 "xor $f28, $f28, $f28 \n\t"
3742 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3743 PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3744 "dli $8, 0x2 \n\t"
3745 "dmtc1 $8, $f30 \n\t"
3746 "1: \n\t"
3747 "xor $f28, $f28, $f28 \n\t"
3748 "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
3749 "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
3750 "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
3751 "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
3752 "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
3753 "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
3754 "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
3755 "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
3756 "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
3757 "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
3758 "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
3759 "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
3760 "punpckhbh $f2, $f0, $f28 \n\t"
3761 "punpckhbh $f6, $f4, $f28 \n\t"
3762 "punpckhbh $f10, $f8, $f28 \n\t"
3763 "punpckhbh $f14, $f12, $f28 \n\t"
3764 "punpckhbh $f18, $f16, $f28 \n\t"
3765 "punpckhbh $f22, $f20, $f28 \n\t"
3766 "punpcklbh $f0, $f0, $f28 \n\t"
3767 "punpcklbh $f4, $f4, $f28 \n\t"
3768 "punpcklbh $f8, $f8, $f28 \n\t"
3769 "punpcklbh $f12, $f12, $f28 \n\t"
3770 "punpcklbh $f16, $f16, $f28 \n\t"
3771 "punpcklbh $f20, $f20, $f28 \n\t"
3772 "paddh $f8, $f8, $f12 \n\t"
3773 "paddh $f10, $f10, $f14 \n\t"
3774 "paddh $f16, $f16, $f20 \n\t"
3775 "paddh $f18, $f18, $f22 \n\t"
3776 "psllh $f16, $f16, $f30 \n\t"
3777 "psllh $f18, $f18, $f30 \n\t"
3778 "psubh $f16, $f16, $f8 \n\t"
3779 "psubh $f18, $f18, $f10 \n\t"
3780 "paddh $f0, $f0, $f4 \n\t"
3781 "paddh $f2, $f2, $f6 \n\t"
3782 "paddh $f0, $f0, $f16 \n\t"
3783 "paddh $f2, $f2, $f18 \n\t"
3784 "psllh $f16, $f16, $f30 \n\t"
3785 "psllh $f18, $f18, $f30 \n\t"
3786 "paddh $f0, $f0, $f16 \n\t"
3787 "paddh $f2, $f2, $f18 \n\t"
3788 "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
3789 "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
3790 "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
3791 "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
3792 PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
3793 PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
3794 PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
3795 "bnez %[iHeight], 1b \n\t"
3796 : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
3797 : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
3798 : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
3799 "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
3800 );
3801 RECOVER_REG;
3802 }
3803
McHorVer22WidthEq8_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3804 static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3805 uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3806 ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
3807 McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
3808 McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
3809 }
3810
McHorVer22WidthEq16_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iHeight)3811 static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
3812 uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
3813 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3814 McHorVer22WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
3815 }
3816
McHorVer22_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3817 static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3818 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3819 if (iWidth == 16)
3820 McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3821 else if (iWidth == 8)
3822 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
3823 else
3824 McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
3825 }
3826
/*!
 * \brief   Per-byte average (pavgb) of two 4-pixel-wide blocks, one row per
 *          loop iteration.
 *
 * Eight bytes are loaded from each source row although only the low four
 * bytes of the result are stored.
 *
 * \param   pDst         destination block (4 bytes are written per row)
 * \param   iDstStride   destination row pitch in bytes
 * \param   pSrcA        first source block
 * \param   iSrcAStride  row pitch of the first source in bytes
 * \param   pSrcB        second source block
 * \param   iSrcBStride  row pitch of the second source in bytes
 * \param   iHeight      number of rows; tested after each row, so >= 1
 */
void PixelAvgWidthEq4_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
                          int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "1: \n\t"
    // Unaligned 8-byte loads from both sources.
    "gsldlc1 $f0, 0x7(%[pSrcB]) \n\t"
    "gsldlc1 $f2, 0x7(%[pSrcA]) \n\t"
    "gsldrc1 $f0, 0x0(%[pSrcB]) \n\t"
    "gsldrc1 $f2, 0x0(%[pSrcA]) \n\t"
    "pavgb $f0, $f0, $f2 \n\t"
    // Unaligned 4-byte store of the low word.
    "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
    "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
    PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
    PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
    PTR_ADDU "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t"
    PTR_ADDU "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t"
    "bnez %[iHeight], 1b \n\t"
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
      [iSrcBStride]"r"((int)iSrcBStride)
    // NOTE(review): $8, $9 and $10 are listed as clobbered but are not
    // referenced by this asm body.
    : "memory", "$8", "$9", "$10", "$f0", "$f2"
  );
}
3851
/*!
 * \brief   Per-byte average (pavgb) of two 8-pixel-wide blocks, two rows per
 *          loop iteration.
 *
 * \param   pDst         destination block (8 bytes are written per row)
 * \param   iDstStride   destination row pitch in bytes
 * \param   pSrcA        first source block
 * \param   iSrcAStride  row pitch of the first source in bytes
 * \param   pSrcB        second source block
 * \param   iSrcBStride  row pitch of the second source in bytes
 * \param   iHeight      number of rows; the counter drops by 2 per iteration
 *                       and the loop ends only when it reaches exactly 0, so
 *                       it has to be a positive multiple of 2
 */
void PixelAvgWidthEq8_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
                          int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "1: \n\t"
    // First row of the pair.
    "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
    "gsldlc1 $f2, 0x7(%[pSrcB]) \n\t"
    "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
    "gsldrc1 $f2, 0x0(%[pSrcB]) \n\t"
    "pavgb $f0, $f0, $f2 \n\t"
    PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"    // $8 = second row of A
    "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
    PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"    // $9 = second row of B
    "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
    // Second row of the pair.
    "gsldlc1 $f0, 0x7($8) \n\t"
    "gsldlc1 $f2, 0x7($9) \n\t"
    "gsldrc1 $f0, 0x0($8) \n\t"
    "gsldrc1 $f2, 0x0($9) \n\t"
    "pavgb $f0, $f0, $f2 \n\t"
    PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"     // $10 = second destination row
    "gssdlc1 $f0, 0x7($10) \n\t"
    // Advance all three pointers past the pair.
    PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
    "gssdrc1 $f0, 0x0($10) \n\t"
    PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
    PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
    PTR_ADDIU "%[iHeight], %[iHeight], -0x2 \n\t"
    "bnez %[iHeight], 1b \n\t"
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
      [iSrcBStride]"r"((int)iSrcBStride)
    : "memory", "$8", "$9", "$10", "$f0", "$f2"
  );
}
3886
/*!
 * \brief   Per-byte average (pavgb) of two 16-pixel-wide blocks, four rows per
 *          loop iteration.
 *
 * \param   pDst         destination block (16 bytes are written per row)
 * \param   iDstStride   destination row pitch in bytes
 * \param   pSrcA        first source block
 * \param   iSrcAStride  row pitch of the first source in bytes
 * \param   pSrcB        second source block
 * \param   iSrcBStride  row pitch of the second source in bytes
 * \param   iHeight      number of rows; the counter drops by 4 per iteration
 *                       and the loop ends only when it reaches exactly 0, so
 *                       it has to be a positive multiple of 4
 */
void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
                           int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "1: \n\t"
    // Row 1: ($f0,$f2) = A bytes 0..15, ($f4,$f6) = B bytes 0..15.
    "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
    "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t"
    "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t"
    "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t"
    "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
    "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t"
    "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t"
    "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t"
    "pavgb $f0, $f0, $f4 \n\t"
    "pavgb $f2, $f2, $f6 \n\t"
    PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"    // $8 = row 2 of A
    "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
    "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
    "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
    "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
    PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"    // $9 = row 2 of B
    // Row 2, addressed through $8 / $9 / $10.
    "gsldlc1 $f0, 0x7($8) \n\t"
    "gsldlc1 $f2, 0xF($8) \n\t"
    "gsldrc1 $f0, 0x0($8) \n\t"
    "gsldrc1 $f2, 0x8($8) \n\t"
    PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"     // $10 = row 2 of the destination
    "gsldlc1 $f4, 0x7($9) \n\t"
    "gsldlc1 $f6, 0xF($9) \n\t"
    "gsldrc1 $f4, 0x0($9) \n\t"
    "gsldrc1 $f6, 0x8($9) \n\t"
    "pavgb $f0, $f0, $f4 \n\t"
    "pavgb $f2, $f2, $f6 \n\t"
    "gssdlc1 $f0, 0x7($10) \n\t"
    "gssdlc1 $f2, 0xF($10) \n\t"
    "gssdrc1 $f0, 0x0($10) \n\t"
    "gssdrc1 $f2, 0x8($10) \n\t"

    // Row 3: move the primary pointers two rows down and repeat.
    PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
    PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
    PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
    "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
    "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t"
    "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t"
    "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t"
    "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
    "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t"
    "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t"
    "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t"
    "pavgb $f0, $f0, $f4 \n\t"
    "pavgb $f2, $f2, $f6 \n\t"
    PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"    // $8 = row 4 of A
    PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"    // $9 = row 4 of B
    "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
    "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
    "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
    "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
    // Row 4.
    "gsldlc1 $f0, 0x7($8) \n\t"
    "gsldlc1 $f2, 0xF($8) \n\t"
    "gsldlc1 $f4, 0x7($9) \n\t"
    "gsldlc1 $f6, 0xF($9) \n\t"
    "gsldrc1 $f0, 0x0($8) \n\t"
    "gsldrc1 $f2, 0x8($8) \n\t"
    "gsldrc1 $f4, 0x0($9) \n\t"
    "gsldrc1 $f6, 0x8($9) \n\t"
    PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"     // $10 = row 4 of the destination
    "pavgb $f0, $f0, $f4 \n\t"
    "pavgb $f2, $f2, $f6 \n\t"
    "gssdlc1 $f0, 0x7($10) \n\t"
    "gssdlc1 $f2, 0xF($10) \n\t"
    "gssdrc1 $f0, 0x0($10) \n\t"
    "gssdrc1 $f2, 0x8($10) \n\t"
    // Advance past the four rows and count them.
    PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
    PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
    PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
    PTR_ADDIU "%[iHeight], %[iHeight], -0x4 \n\t"
    "bnez %[iHeight], 1b \n\t"
    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
      [iSrcBStride]"r"((int)iSrcBStride)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
  );
}
3970
McHorVer01_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3971 static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3972 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3973 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3974 if (iWidth == 16) {
3975 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3976 PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3977 } else if (iWidth == 8) {
3978 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3979 PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3980 } else {
3981 McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3982 PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
3983 }
3984 }
3985
McHorVer03_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)3986 static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
3987 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
3988 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
3989 if (iWidth == 16) {
3990 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3991 PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3992 } else if (iWidth == 8) {
3993 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
3994 PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3995 } else {
3996 McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
3997 PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
3998 }
3999 }
4000
McHorVer10_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4001 static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4002 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4003 ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
4004 if (iWidth == 16) {
4005 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4006 PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4007 } else if (iWidth == 8) {
4008 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4009 PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4010 } else {
4011 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
4012 PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
4013 }
4014 }
4015
McHorVer11_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4016 static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4017 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4018 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4019 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4020 if (iWidth == 16) {
4021 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4022 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4023 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4024 } else if (iWidth == 8) {
4025 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4026 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4027 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4028 } else {
4029 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4030 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4031 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4032 }
4033 }
4034
McHorVer12_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4035 static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4036 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4037 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4038 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4039 if (iWidth == 16) {
4040 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4041 McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4042 PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4043 } else if (iWidth == 8) {
4044 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4045 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4046 PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4047 } else {
4048 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
4049 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4050 PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4051 }
4052 }
McHorVer13_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4053 static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4054 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4055 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4056 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4057 if (iWidth == 16) {
4058 McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4059 McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4060 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4061 } else if (iWidth == 8) {
4062 McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4063 McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
4064 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4065 } else {
4066 McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4067 McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4 , iHeight);
4068 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4069 }
4070 }
McHorVer21_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4071 static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4072 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4073 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4074 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4075 if (iWidth == 16) {
4076 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4077 McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4078 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4079 } else if (iWidth == 8) {
4080 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4081 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4082 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4083 } else {
4084 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4085 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4086 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4087 }
4088 }
4089
McHorVer23_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4090 static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4091 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4092 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4093 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4094 if (iWidth == 16) {
4095 McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4096 McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4097 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4098 } else if (iWidth == 8) {
4099 McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4100 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4101 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4102 } else {
4103 McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4104 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4105 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
4106 }
4107 }
McHorVer30_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4108 static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4109 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4110 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4111 if (iWidth == 16) {
4112 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4113 PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4114 } else if (iWidth == 8) {
4115 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4116 PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4117 } else {
4118 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4119 PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
4120 }
4121 }
McHorVer31_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4122 static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4123 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4124 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4125 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4126 if (iWidth == 16) {
4127 McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4128 McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4129 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4130 } else if (iWidth == 8) {
4131 McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4132 McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4133 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4134 } else {
4135 McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
4136 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4137 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4138 }
4139 }
McHorVer32_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4140 static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4141 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4142 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4143 ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
4144 if (iWidth == 16) {
4145 McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4146 McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4147 PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4148 } else if (iWidth == 8) {
4149 McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4150 McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
4151 PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4152 } else {
4153 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4154 McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
4155 PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
4156 }
4157 }
McHorVer33_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4158 static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4159 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4160 ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
4161 ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
4162 if (iWidth == 16) {
4163 McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4164 McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4165 PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4166 } else if (iWidth == 8) {
4167 McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4168 McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
4169 PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4170 } else {
4171 McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
4172 McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
4173 PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
4174 }
4175 }
4176
McLuma_mmi(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)4177 void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
4178 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
4179 static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
4180 {McCopy_mmi, McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi},
4181 {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi},
4182 {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi},
4183 {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi},
4184 };
4185
4186 pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4187 }
4188
PixelAvg_mmi(uint8_t * pDst,int32_t iDstStride,const uint8_t * pSrcA,int32_t iSrcAStride,const uint8_t * pSrcB,int32_t iSrcBStride,int32_t iWidth,int32_t iHeight)4189 void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
4190 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
4191 static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
4192 PixelAvgWidthEq8_mmi,
4193 PixelAvgWidthEq16_mmi
4194 };
4195 kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
4196 }
4197 #endif//HAVE_MMI
4198
4199 #if defined(HAVE_LSX)
McCopy_lsx(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight)4200 static inline void McCopy_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4201 int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
4202 if (iWidth == 16)
4203 McCopyWidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4204 else if (iWidth == 8)
4205 McCopyWidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4206 else if (iWidth == 4)
4207 McCopyWidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4208 else
4209 McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
4210 }
4211
McChroma_lsx(const uint8_t * pSrc,int32_t iSrcStride,uint8_t * pDst,int32_t iDstStride,int16_t iMvX,int16_t iMvY,int32_t iWidth,int32_t iHeight)4212 void McChroma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
4213 int32_t iDstStride, int16_t iMvX, int16_t iMvY,
4214 int32_t iWidth, int32_t iHeight) {
4215 static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
4216 McChromaWidthEq4_lsx,
4217 McChromaWidthEq8_lsx
4218 };
4219 const int32_t kiD8x = iMvX & 0x07;
4220 const int32_t kiD8y = iMvY & 0x07;
4221 if (kiD8x == 0 && kiD8y == 0) {
4222 McCopy_lsx (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
4223 return;
4224 }
4225 if (iWidth != 2) {
4226 kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
4227 g_kuiABCD[kiD8y][kiD8x], iHeight);
4228 } else
4229 McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
4230 iWidth, iHeight);
4231 }
4232 #endif//HAVE_LSX
4233
4234 } // anon ns.
4235
InitMcFunc(SMcFunc * pMcFuncs,uint32_t uiCpuFlag)4236 void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
4237 pMcFuncs->pfLumaHalfpelHor = McHorVer20_c;
4238 pMcFuncs->pfLumaHalfpelVer = McHorVer02_c;
4239 pMcFuncs->pfLumaHalfpelCen = McHorVer22_c;
4240 pMcFuncs->pfSampleAveraging = PixelAvg_c;
4241 pMcFuncs->pMcChromaFunc = McChroma_c;
4242 pMcFuncs->pMcLumaFunc = McLuma_c;
4243
4244 #if defined (X86_ASM)
4245 if (uiCpuFlag & WELS_CPU_SSE2) {
4246 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_sse2;
4247 pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_sse2;
4248 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_sse2;
4249 pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
4250 pMcFuncs->pMcChromaFunc = McChroma_sse2;
4251 pMcFuncs->pMcLumaFunc = McLuma_sse2;
4252 }
4253
4254 if (uiCpuFlag & WELS_CPU_SSSE3) {
4255 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_ssse3;
4256 pMcFuncs->pfLumaHalfpelVer = McHorVer02_ssse3;
4257 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_ssse3;
4258 pMcFuncs->pMcChromaFunc = McChroma_ssse3;
4259 pMcFuncs->pMcLumaFunc = McLuma_ssse3;
4260 }
4261 #ifdef HAVE_AVX2
4262 if (uiCpuFlag & WELS_CPU_AVX2) {
4263 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_avx2;
4264 pMcFuncs->pfLumaHalfpelVer = McHorVer02_avx2;
4265 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_avx2;
4266 pMcFuncs->pMcLumaFunc = McLuma_avx2;
4267 }
4268 #endif
4269 #endif //(X86_ASM)
4270
4271 #if defined(HAVE_NEON)
4272 if (uiCpuFlag & WELS_CPU_NEON) {
4273 pMcFuncs->pMcLumaFunc = McLuma_neon;
4274 pMcFuncs->pMcChromaFunc = McChroma_neon;
4275 pMcFuncs->pfSampleAveraging = PixelAvg_neon;
4276 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_neon;//iWidth+1:4/8/16
4277 pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_neon;//heigh+1:4/8/16
4278 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_neon;//iWidth+1/heigh+1
4279 }
4280 #endif
4281 #if defined(HAVE_NEON_AARCH64)
4282 if (uiCpuFlag & WELS_CPU_NEON) {
4283 pMcFuncs->pMcLumaFunc = McLuma_AArch64_neon;
4284 pMcFuncs->pMcChromaFunc = McChroma_AArch64_neon;
4285 pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon;
4286 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_AArch64_neon;//iWidth+1:4/8/16
4287 pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_AArch64_neon;//heigh+1:4/8/16
4288 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1
4289 }
4290 #endif
4291
4292 #if defined(HAVE_MMI)
4293 if (uiCpuFlag & WELS_CPU_MMI) {
4294 pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_mmi;
4295 pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_mmi;
4296 pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_mmi;
4297 pMcFuncs->pfSampleAveraging = PixelAvg_mmi;
4298 pMcFuncs->pMcChromaFunc = McChroma_mmi;
4299 pMcFuncs->pMcLumaFunc = McLuma_mmi;
4300 }
4301 #endif//HAVE_MMI
4302
4303 #if defined(HAVE_LSX)
4304 if (uiCpuFlag & WELS_CPU_LSX) {
4305 pMcFuncs->pMcChromaFunc = McChroma_lsx;
4306 }
4307 #endif//HAVE_LSX
4308 }
4309