1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file sample.c
33 *
34 * \brief compute SAD and SATD
35 *
36 * \date 2009.06.02 Created
37 *
38 *************************************************************************************
39 */
40
41 #include "sample.h"
42 #include "sad_common.h"
43 #include "intra_pred_common.h"
44 #include "mc.h"
45 #include "cpu_core.h"
46
47 namespace WelsEnc {
WelsSampleSatd4x4_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)48 int32_t WelsSampleSatd4x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
49 int32_t iSatdSum = 0;
50 int32_t pSampleMix[4][4] = {{ 0 }};
51 int32_t iSample0, iSample1, iSample2, iSample3;
52 int32_t i = 0;
53 uint8_t* pSrc1 = pSample1;
54 uint8_t* pSrc2 = pSample2;
55
56 //step 1: get the difference
57 for (i = 0; i < 4; i++) {
58 pSampleMix[i][0] = pSrc1[0] - pSrc2[0];
59 pSampleMix[i][1] = pSrc1[1] - pSrc2[1];
60 pSampleMix[i][2] = pSrc1[2] - pSrc2[2];
61 pSampleMix[i][3] = pSrc1[3] - pSrc2[3];
62
63 pSrc1 += iStride1;
64 pSrc2 += iStride2;
65 }
66
67 //step 2: horizontal transform
68 for (i = 0; i < 4; i++) {
69 iSample0 = pSampleMix[i][0] + pSampleMix[i][2];
70 iSample1 = pSampleMix[i][1] + pSampleMix[i][3];
71 iSample2 = pSampleMix[i][0] - pSampleMix[i][2];
72 iSample3 = pSampleMix[i][1] - pSampleMix[i][3];
73
74 pSampleMix[i][0] = iSample0 + iSample1;
75 pSampleMix[i][1] = iSample2 + iSample3;
76 pSampleMix[i][2] = iSample2 - iSample3;
77 pSampleMix[i][3] = iSample0 - iSample1;
78 }
79
80 //step 3: vertical transform and get the sum of SATD
81 for (i = 0; i < 4; i++) {
82 iSample0 = pSampleMix[0][i] + pSampleMix[2][i];
83 iSample1 = pSampleMix[1][i] + pSampleMix[3][i];
84 iSample2 = pSampleMix[0][i] - pSampleMix[2][i];
85 iSample3 = pSampleMix[1][i] - pSampleMix[3][i];
86
87 pSampleMix[0][i] = iSample0 + iSample1;
88 pSampleMix[1][i] = iSample2 + iSample3;
89 pSampleMix[2][i] = iSample2 - iSample3;
90 pSampleMix[3][i] = iSample0 - iSample1;
91
92 iSatdSum += (WELS_ABS (pSampleMix[0][i]) + WELS_ABS (pSampleMix[1][i]) + WELS_ABS (pSampleMix[2][i]) + WELS_ABS (
93 pSampleMix[3][i]));
94 }
95
96 return ((iSatdSum + 1) >> 1);
97 }
98
WelsSampleSatd8x4_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)99 int32_t WelsSampleSatd8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
100 int32_t iSatdSum = 0;
101 iSatdSum += WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
102 iSatdSum += WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2);
103 return iSatdSum;
104 }
105
WelsSampleSatd4x8_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)106 int32_t WelsSampleSatd4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
107 int32_t iSatdSum = 0;
108 iSatdSum += WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
109 iSatdSum += WelsSampleSatd4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2);
110 return iSatdSum;
111 }
112
WelsSampleSatd8x8_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)113 int32_t WelsSampleSatd8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
114 int32_t iSatdSum = 0;
115
116 iSatdSum += WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
117 iSatdSum += WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2);
118 iSatdSum += WelsSampleSatd4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2);
119 iSatdSum += WelsSampleSatd4x4_c (pSample1 + (iStride1 << 2) + 4, iStride1, pSample2 + (iStride2 << 2) + 4, iStride2);
120
121 return iSatdSum;
122 }
WelsSampleSatd16x8_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)123 int32_t WelsSampleSatd16x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
124 int32_t iSatdSum = 0;
125
126 iSatdSum += WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
127 iSatdSum += WelsSampleSatd8x8_c (pSample1 + 8, iStride1, pSample2 + 8, iStride2);
128
129 return iSatdSum;
130 }
WelsSampleSatd8x16_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)131 int32_t WelsSampleSatd8x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
132 int32_t iSatdSum = 0;
133
134 iSatdSum += WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
135 iSatdSum += WelsSampleSatd8x8_c (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2);
136
137 return iSatdSum;
138 }
WelsSampleSatd16x16_c(uint8_t * pSample1,int32_t iStride1,uint8_t * pSample2,int32_t iStride2)139 int32_t WelsSampleSatd16x16_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
140 int32_t iSatdSum = 0;
141
142 iSatdSum += WelsSampleSatd8x8_c (pSample1, iStride1, pSample2, iStride2);
143 iSatdSum += WelsSampleSatd8x8_c (pSample1 + 8, iStride1, pSample2 + 8, iStride2);
144 iSatdSum += WelsSampleSatd8x8_c (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2);
145 iSatdSum += WelsSampleSatd8x8_c (pSample1 + (iStride1 << 3) + 8, iStride1, pSample2 + (iStride2 << 3) + 8, iStride2);
146
147 return iSatdSum;
148 }
149
150
151 extern void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
152 extern void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
153 extern void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
154
WelsSampleSatdIntra4x4Combined3_c(uint8_t * pDec,int32_t iDecStride,uint8_t * pEnc,int32_t iEncStride,uint8_t * pDst,int32_t * pBestMode,int32_t iLambda2,int32_t iLambda1,int32_t iLambda0)155 int32_t WelsSampleSatdIntra4x4Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
156 uint8_t* pDst,
157 int32_t* pBestMode, int32_t iLambda2, int32_t iLambda1, int32_t iLambda0) {
158 int32_t iBestMode = -1;
159 int32_t iCurCost, iBestCost = INT_MAX;
160 ENFORCE_STACK_ALIGN_2D (uint8_t, uiLocalBuffer, 3, 16, 16)
161
162 WelsI4x4LumaPredDc_c (uiLocalBuffer[2], pDec, iDecStride);
163 iCurCost = WelsSampleSatd4x4_c (uiLocalBuffer[2], 4, pEnc, iEncStride) + iLambda2;
164 if (iCurCost < iBestCost) {
165 iBestMode = 2;
166 iBestCost = iCurCost;
167 }
168
169 WelsI4x4LumaPredH_c (uiLocalBuffer[1], pDec, iDecStride);
170 iCurCost = WelsSampleSatd4x4_c (uiLocalBuffer[1], 4, pEnc, iEncStride) + iLambda1;
171 if (iCurCost < iBestCost) {
172 iBestMode = 1;
173 iBestCost = iCurCost;
174 }
175 WelsI4x4LumaPredV_c (uiLocalBuffer[0], pDec, iDecStride);
176 iCurCost = WelsSampleSatd4x4_c (uiLocalBuffer[0], 4, pEnc, iEncStride) + iLambda0;
177 if (iCurCost < iBestCost) {
178 iBestMode = 0;
179 iBestCost = iCurCost;
180 }
181
182 memcpy (pDst, uiLocalBuffer[iBestMode], 16 * sizeof (uint8_t)); // confirmed_safe_unsafe_usage
183 *pBestMode = iBestMode;
184
185 return iBestCost;
186 }
187 extern void WelsIChromaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
188 extern void WelsIChromaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
189 extern void WelsIChromaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
190
WelsSampleSatdIntra8x8Combined3_c(uint8_t * pDecCb,int32_t iDecStride,uint8_t * pEncCb,int32_t iEncStride,int32_t * pBestMode,int32_t iLambda,uint8_t * pDstChroma,uint8_t * pDecCr,uint8_t * pEncCr)191 int32_t WelsSampleSatdIntra8x8Combined3_c (uint8_t* pDecCb, int32_t iDecStride, uint8_t* pEncCb, int32_t iEncStride,
192 int32_t* pBestMode, int32_t iLambda, uint8_t* pDstChroma, uint8_t* pDecCr, uint8_t* pEncCr) {
193 int32_t iBestMode = -1;
194 int32_t iCurCost, iBestCost = INT_MAX;
195
196 WelsIChromaPredV_c (pDstChroma, pDecCb, iDecStride);
197 WelsIChromaPredV_c (pDstChroma + 64, pDecCr, iDecStride);
198 iCurCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
199 iCurCost += WelsSampleSatd8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;
200
201 if (iCurCost < iBestCost) {
202 iBestMode = 2;
203 iBestCost = iCurCost;
204 }
205
206 WelsIChromaPredH_c (pDstChroma, pDecCb, iDecStride);
207 WelsIChromaPredH_c (pDstChroma + 64, pDecCr, iDecStride);
208 iCurCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
209 iCurCost += WelsSampleSatd8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;
210 if (iCurCost < iBestCost) {
211 iBestMode = 1;
212 iBestCost = iCurCost;
213 }
214 WelsIChromaPredDc_c (pDstChroma, pDecCb, iDecStride);
215 WelsIChromaPredDc_c (pDstChroma + 64, pDecCr, iDecStride);
216 iCurCost = WelsSampleSatd8x8_c (pDstChroma, 8, pEncCb, iEncStride);
217 iCurCost += WelsSampleSatd8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride);
218 if (iCurCost < iBestCost) {
219 iBestMode = 0;
220 iBestCost = iCurCost;
221 }
222
223 *pBestMode = iBestMode;
224
225 return iBestCost;
226
227
228 }
WelsSampleSadIntra8x8Combined3_c(uint8_t * pDecCb,int32_t iDecStride,uint8_t * pEncCb,int32_t iEncStride,int32_t * pBestMode,int32_t iLambda,uint8_t * pDstChroma,uint8_t * pDecCr,uint8_t * pEncCr)229 int32_t WelsSampleSadIntra8x8Combined3_c (uint8_t* pDecCb, int32_t iDecStride, uint8_t* pEncCb, int32_t iEncStride,
230 int32_t* pBestMode, int32_t iLambda, uint8_t* pDstChroma, uint8_t* pDecCr, uint8_t* pEncCr) {
231 int32_t iBestMode = -1;
232 int32_t iCurCost, iBestCost = INT_MAX;
233
234 WelsIChromaPredV_c (pDstChroma, pDecCb, iDecStride);
235 WelsIChromaPredV_c (pDstChroma + 64, pDecCr, iDecStride);
236 iCurCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
237 iCurCost += WelsSampleSad8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;
238
239 if (iCurCost < iBestCost) {
240 iBestMode = 2;
241 iBestCost = iCurCost;
242 }
243
244 WelsIChromaPredH_c (pDstChroma, pDecCb, iDecStride);
245 WelsIChromaPredH_c (pDstChroma + 64, pDecCr, iDecStride);
246 iCurCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
247 iCurCost += WelsSampleSad8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;
248 if (iCurCost < iBestCost) {
249 iBestMode = 1;
250 iBestCost = iCurCost;
251 }
252 WelsIChromaPredDc_c (pDstChroma, pDecCb, iDecStride);
253 WelsIChromaPredDc_c (pDstChroma + 64, pDecCr, iDecStride);
254 iCurCost = WelsSampleSad8x8_c (pDstChroma, 8, pEncCb, iEncStride);
255 iCurCost += WelsSampleSad8x8_c (pDstChroma + 64, 8, pEncCr, iEncStride);
256 if (iCurCost < iBestCost) {
257 iBestMode = 0;
258 iBestCost = iCurCost;
259 }
260
261 *pBestMode = iBestMode;
262
263 return iBestCost;
264
265 }
266
267 extern void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
268 //extern void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
269 //extern void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
270
WelsSampleSatdIntra16x16Combined3_c(uint8_t * pDec,int32_t iDecStride,uint8_t * pEnc,int32_t iEncStride,int32_t * pBestMode,int32_t iLambda,uint8_t * pDst)271 int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
272 int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
273 int32_t iBestMode = -1;
274 int32_t iCurCost, iBestCost = INT_MAX;
275
276 WelsI16x16LumaPredV_c (pDst, pDec, iDecStride);
277 iCurCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride);
278
279 if (iCurCost < iBestCost) {
280 iBestMode = 0;
281 iBestCost = iCurCost;
282 }
283
284 WelsI16x16LumaPredH_c (pDst, pDec, iDecStride);
285 iCurCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride) + iLambda * 2;
286 if (iCurCost < iBestCost) {
287 iBestMode = 1;
288 iBestCost = iCurCost;
289 }
290 WelsI16x16LumaPredDc_c (pDst, pDec, iDecStride);
291 iCurCost = WelsSampleSatd16x16_c (pDst, 16, pEnc, iEncStride) + iLambda * 2;
292 if (iCurCost < iBestCost) {
293 iBestMode = 2;
294 iBestCost = iCurCost;
295 }
296
297 *pBestMode = iBestMode;
298
299 return iBestCost;
300
301
302 }
WelsSampleSadIntra16x16Combined3_c(uint8_t * pDec,int32_t iDecStride,uint8_t * pEnc,int32_t iEncStride,int32_t * pBestMode,int32_t iLambda,uint8_t * pDst)303 int32_t WelsSampleSadIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
304 int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
305 int32_t iBestMode = -1;
306 int32_t iCurCost, iBestCost = INT_MAX;
307
308 WelsI16x16LumaPredV_c (pDst, pDec, iDecStride);
309 iCurCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride);
310
311 if (iCurCost < iBestCost) {
312 iBestMode = 0;
313 iBestCost = iCurCost;
314 }
315
316 WelsI16x16LumaPredH_c (pDst, pDec, iDecStride);
317 iCurCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride) + iLambda * 2;
318 if (iCurCost < iBestCost) {
319 iBestMode = 1;
320 iBestCost = iCurCost;
321 }
322 WelsI16x16LumaPredDc_c (pDst, pDec, iDecStride);
323 iCurCost = WelsSampleSad16x16_c (pDst, 16, pEnc, iEncStride) + iLambda * 2;
324 if (iCurCost < iBestCost) {
325 iBestMode = 2;
326 iBestCost = iCurCost;
327 }
328
329 *pBestMode = iBestMode;
330
331 return iBestCost;
332
333
334 }
335
WelsInitSampleSadFunc(SWelsFuncPtrList * pFuncList,uint32_t uiCpuFlag)336 void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
337 //pfSampleSad init
338 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_c;
339 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_c;
340 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
341 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8 ] = WelsSampleSad8x8_c;
342 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_c;
343 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x4 ] = WelsSampleSad8x4_c;
344 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x8 ] = WelsSampleSad4x8_c;
345
346 //pfSampleSatd init
347 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
348 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_c;
349 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
350 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_c;
351 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_c;
352 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x4 ] = WelsSampleSatd8x4_c;
353 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x8 ] = WelsSampleSatd4x8_c;
354
355 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
356 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
357 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
358 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
359 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
360 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x4] = WelsSampleSadFour8x4_c;
361 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x8] = WelsSampleSadFour4x8_c;
362
363 pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = NULL;
364 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = NULL;
365 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = NULL;
366 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = NULL;
367 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = NULL;
368
369 #if defined (X86_ASM)
370 if (uiCpuFlag & WELS_CPU_MMXEXT) {
371 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_mmx;
372 }
373
374 if (uiCpuFlag & WELS_CPU_SSE2) {
375 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_sse2;
376 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_sse2;
377 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_sse2;
378 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_sse21;
379
380 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_sse2;
381 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_sse2;
382 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_sse2;
383 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_sse2;
384 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_sse2;
385
386 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_sse2;
387 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_sse2;
388 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
389 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
390 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
391 pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2;
392 }
393
394 if (uiCpuFlag & WELS_CPU_SSSE3) {
395 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
396 }
397
398 if (uiCpuFlag & WELS_CPU_SSE41) {
399 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse41;
400 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_sse41;
401 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
402 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
403 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
404 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
405 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
406 }
407 #if defined(HAVE_AVX2)
408 if (uiCpuFlag & WELS_CPU_AVX2) {
409 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_avx2;
410 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_avx2;
411 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_avx2;
412 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_avx2;
413 }
414 #endif
415 #endif //(X86_ASM)
416
417 #if defined (HAVE_NEON)
418 if (uiCpuFlag & WELS_CPU_NEON) {
419 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon;
420 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
421 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
422 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
423 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
424
425 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
426 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
427 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
428 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
429 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
430
431 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon;
432 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon;
433 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
434 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
435 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
436
437 pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_neon;
438 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon;
439 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon;
440 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
441 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
442 }
443 #endif
444
445 #if defined (HAVE_NEON_AARCH64)
446 if (uiCpuFlag & WELS_CPU_NEON) {
447 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_AArch64_neon;
448 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_AArch64_neon;
449 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_AArch64_neon;
450 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_AArch64_neon;
451 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_AArch64_neon;
452
453 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_AArch64_neon;
454 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_AArch64_neon;
455 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_AArch64_neon;
456 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_AArch64_neon;
457 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_AArch64_neon;
458
459 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_AArch64_neon;
460 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_AArch64_neon;
461 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
462 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
463 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
464
465 pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_AArch64_neon;
466 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_AArch64_neon;
467 pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_AArch64_neon;
468 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_AArch64_neon;
469 pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
470 }
471 #endif
472
473 #if defined (HAVE_MMI)
474 if (uiCpuFlag & WELS_CPU_MMI) {
475 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_mmi;
476 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_mmi;
477 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_mmi;
478 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_mmi;
479 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_mmi;
480
481 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_mmi;
482 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_mmi;
483 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_mmi;
484 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_mmi;
485 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_mmi;
486
487 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_mmi;
488 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_mmi;
489 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_mmi;
490 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_mmi;
491 }
492 #endif//HAVE_MMI
493
494 #if defined (HAVE_LASX)
495 if (uiCpuFlag & WELS_CPU_LASX) {
496 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4] = WelsSampleSad4x4_lasx;
497 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_lasx;
498 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_lasx;
499 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8] = WelsSampleSad16x8_lasx;
500 pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_lasx;
501
502 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_lasx;
503 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_lasx;
504 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_lasx;
505 pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_lasx;
506
507 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_lasx;
508 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_lasx;
509 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_lasx;
510 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_lasx;
511 pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_lasx;
512 }
513 #endif
514 }
515
516 } // namespace WelsEnc
517