1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file svc motion estimate.c
33 *
34 * \brief Interfaces introduced in svc mb motion estimation
35 *
36 * \date 08/11/2009 Created
37 *
38 *************************************************************************************
39 */
40
41 #include "cpu_core.h"
42 #include "ls_defines.h"
43 #include "svc_motion_estimate.h"
44 #include "wels_transpose_matrix.h"
45
46 namespace WelsEnc {
47
/*
 * Quantizer step size scaled by 16 (QStep << 4), stored as int32_t and
 * indexed by QP in [0, 51].  One row per six consecutive QP values; every
 * entry is exactly twice the entry six positions before it.
 */
const int32_t QStepx16ByQp[52] = {
  /* QP  0.. 5 */   10,   11,   13,   14,   16,   18,
  /* QP  6..11 */   20,   22,   26,   28,   32,   36,
  /* QP 12..17 */   40,   44,   52,   56,   64,   72,
  /* QP 18..23 */   80,   88,  104,  112,  128,  144,
  /* QP 24..29 */  160,  176,  208,  224,  256,  288,
  /* QP 30..35 */  320,  352,  416,  448,  512,  576,
  /* QP 36..41 */  640,  704,  832,  896, 1024, 1152,
  /* QP 42..47 */ 1280, 1408, 1664, 1792, 2048, 2304,
  /* QP 48..51 */ 2560, 2816, 3328, 3584
};
59
UpdateMeResults(const SMVUnitXY ksBestMv,const uint32_t kiBestSadCost,uint8_t * pRef,SWelsME * pMe)60 static inline void UpdateMeResults (const SMVUnitXY ksBestMv, const uint32_t kiBestSadCost, uint8_t* pRef,
61 SWelsME* pMe) {
62 pMe->sMv = ksBestMv;
63 pMe->pRefMb = pRef;
64 pMe->uiSadCost = kiBestSadCost;
65 }
MeEndIntepelSearch(SWelsME * pMe)66 static inline void MeEndIntepelSearch (SWelsME* pMe) {
67 /* -> qpel mv */
68 pMe->sMv.iMvX *= (1 << 2);
69 pMe->sMv.iMvY *= (1 << 2);
70 pMe->uiSatdCost = pMe->uiSadCost;
71 }
72
WelsInitMeFunc(SWelsFuncPtrList * pFuncList,uint32_t uiCpuFlag,bool bScreenContent)73 void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScreenContent) {
74 pFuncList->pfUpdateFMESwitch = UpdateFMESwitchNull;
75
76 if (!bScreenContent) {
77 pFuncList->pfCheckDirectionalMv = CheckDirectionalMvFalse;
78 pFuncList->pfCalculateBlockFeatureOfFrame[0] =
79 pFuncList->pfCalculateBlockFeatureOfFrame[1] = NULL;
80 pFuncList->pfCalculateSingleBlockFeature[0] =
81 pFuncList->pfCalculateSingleBlockFeature[1] = NULL;
82
83 } else {
84 pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
85
86 //for cross serarch
87 pFuncList->pfVerticalFullSearch = LineFullSearch_c;
88 pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
89
90 #if defined (X86_ASM)
91 if (uiCpuFlag & WELS_CPU_SSE41) {
92 pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
93 pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
94 pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
95 pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
96 }
97 #endif
98
99 //for feature search
100 pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_c;
101 pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_c;
102 pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_c;
103 pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_c;
104 //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
105 pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
106 pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
107 #if defined (X86_ASM)
108 if (uiCpuFlag & WELS_CPU_SSE2) {
109 //for feature search
110 pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2;
111 pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2;
112 pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
113 pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
114 //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
115 pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2;
116 pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2;
117 }
118 if (uiCpuFlag & WELS_CPU_SSE41) {
119 //for feature search
120 pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4;
121 pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4;
122 }
123 #endif
124
125 #if defined (HAVE_NEON)
126 if (uiCpuFlag & WELS_CPU_NEON) {
127 //for feature search
128 pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
129 pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
130 pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
131 pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
132 //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
133 pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
134 pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
135 }
136 #endif
137
138 #if defined (HAVE_NEON_AARCH64)
139 if (uiCpuFlag & WELS_CPU_NEON) {
140 //for feature search
141 pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
142 pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
143 pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
144 pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
145 //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
146 pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
147 pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
148 }
149 #endif
150 }
151 }
152
153 /*!
154 * \brief BL mb motion estimate search
155 *
156 * \param enc Wels encoder context
157 * \param pMe Wels me information
158 *
159 * \return NONE
160 */
161
WelsMotionEstimateSearch(SWelsFuncPtrList * pFuncList,SDqLayer * pCurDqLayer,SWelsME * pMe,SSlice * pSlice)162 void WelsMotionEstimateSearch (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe, SSlice* pSlice) {
163 const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0];
164 const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
165
166 // Step 1: Initial point prediction
167 if (!WelsMotionEstimateInitialPoint (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef)) {
168 pFuncList->pfSearchMethod[pMe->uiBlockSize] (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef);
169 MeEndIntepelSearch (pMe);
170 }
171
172 pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc,
173 kiStrideRef);
174 }
175
WelsMotionEstimateSearchStatic(SWelsFuncPtrList * pFuncList,SDqLayer * pCurDqLayer,SWelsME * pMe,SSlice * pLpslice)176 void WelsMotionEstimateSearchStatic (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe,
177 SSlice* pLpslice) {
178 const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0];
179 const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
180
181 pMe->sMv.iMvX = pMe->sMv.iMvY = 0;
182 pMe->uiSadCost =
183 pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize] (pMe->pEncMb, kiStrideEnc, pMe->pRefMb, kiStrideRef) ;
184 pMe->uiSadCost += COST_MVD (pMe->pMvdCost, - pMe->sMvp.iMvX, - pMe->sMvp.iMvY);
185 MeEndIntepelSearch (pMe);
186 pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc,
187 kiStrideRef);
188 }
189
WelsMotionEstimateSearchScrolled(SWelsFuncPtrList * pFuncList,SDqLayer * pCurDqLayer,SWelsME * pMe,SSlice * pSlice)190 void WelsMotionEstimateSearchScrolled (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe,
191 SSlice* pSlice) {
192 const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0];
193 const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
194
195 pMe->sMv = pMe->sDirectionalMv;
196 pMe->pRefMb = pMe->pColoRefMb + pMe->sMv.iMvY * kiStrideRef + pMe->sMv.iMvX;
197 pMe->uiSadCost =
198 pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize] (pMe->pEncMb, kiStrideEnc, pMe->pRefMb, kiStrideRef)
199 + COST_MVD (pMe->pMvdCost, (pMe->sMv.iMvX * (1 << 2)) - pMe->sMvp.iMvX, (pMe->sMv.iMvY * (1 << 2)) - pMe->sMvp.iMvY);
200 MeEndIntepelSearch (pMe);
201 pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc,
202 kiStrideRef);
203 }
204 /*!
205 * \brief EL mb motion estimate initial point testing
206 *
207 * \param pix_pFuncList SSampleDealingFunc
208 * \param pMe Wels me information
209 * \param mv_range search range in motion estimate
210 * \param point the best match point in motion estimation
211 *
212 * \return NONE
213 */
WelsMotionEstimateInitialPoint(SWelsFuncPtrList * pFuncList,SWelsME * pMe,SSlice * pSlice,int32_t iStrideEnc,int32_t iStrideRef)214 bool WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, int32_t iStrideEnc,
215 int32_t iStrideRef) {
216 PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
217 const uint16_t* kpMvdCost = pMe->pMvdCost;
218 uint8_t* const kpEncMb = pMe->pEncMb;
219 int16_t iMvc0, iMvc1;
220 int32_t iSadCost;
221 int32_t iBestSadCost;
222 uint8_t* pRefMb;
223 uint8_t* pFref2;
224 uint32_t i;
225 const uint32_t kuiMvcNum = pSlice->uiMvcNum;
226 const SMVUnitXY* kpMvcList = &pSlice->sMvc[0];
227 const SMVUnitXY ksMvStartMin = pSlice->sMvStartMin;
228 const SMVUnitXY ksMvStartMax = pSlice->sMvStartMax;
229 const SMVUnitXY ksMvp = pMe->sMvp;
230 SMVUnitXY sMv;
231
232 // Step 1: Initial point prediction
233 // init with sMvp
234 sMv.iMvX = WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
235 sMv.iMvY = WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);
236
237 pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
238
239 iBestSadCost = pSad (kpEncMb, iStrideEnc, pRefMb, iStrideRef);
240 iBestSadCost += COST_MVD (kpMvdCost, ((sMv.iMvX) * (1 << 2)) - ksMvp.iMvX, ((sMv.iMvY) * (1 << 2)) - ksMvp.iMvY);
241
242 for (i = 0; i < kuiMvcNum; i++) {
243 //clipping here is essential since some pOut-of-range MVC may happen here (i.e., refer to baseMV)
244 iMvc0 = WELS_CLIP3 ((2 + kpMvcList[i].iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
245 iMvc1 = WELS_CLIP3 ((2 + kpMvcList[i].iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);
246
247 if (((iMvc0 - sMv.iMvX) || (iMvc1 - sMv.iMvY))) {
248 pFref2 = &pMe->pRefMb[iMvc1 * iStrideRef + iMvc0];
249
250 iSadCost = pSad (kpEncMb, iStrideEnc, pFref2, iStrideRef) +
251 COST_MVD (kpMvdCost, (iMvc0 * (1 << 2)) - ksMvp.iMvX, (iMvc1 * (1 << 2)) - ksMvp.iMvY);
252
253 if (iSadCost < iBestSadCost) {
254 sMv.iMvX = iMvc0;
255 sMv.iMvY = iMvc1;
256 pRefMb = pFref2;
257 iBestSadCost = iSadCost;
258 }
259 }
260 }
261
262 if (pFuncList->pfCheckDirectionalMv
263 (pSad, pMe, ksMvStartMin, ksMvStartMax, iStrideEnc, iStrideRef, iSadCost)) {
264 sMv = pMe->sDirectionalMv;
265 pRefMb = &pMe->pColoRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
266 iBestSadCost = iSadCost;
267 }
268
269 UpdateMeResults (sMv, iBestSadCost, pRefMb, pMe);
270 if (iBestSadCost < static_cast<int32_t> (pMe->uSadPredISatd.uiSadPred)) {
271 //Initial point early Stop
272 MeEndIntepelSearch (pMe);
273 return true;
274 }
275 return false;
276 }
277
CalculateSatdCost(PSampleSadSatdCostFunc pSatd,SWelsME * pMe,const int32_t kiEncStride,const int32_t kiRefStride)278 void CalculateSatdCost (PSampleSadSatdCostFunc pSatd, SWelsME* pMe,
279 const int32_t kiEncStride, const int32_t kiRefStride) {
280 pMe->uSadPredISatd.uiSatd = pSatd (pMe->pEncMb, kiEncStride, pMe->pRefMb, kiRefStride);
281 pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX,
282 pMe->sMv.iMvY - pMe->sMvp.iMvY);
283 }
NotCalculateSatdCost(PSampleSadSatdCostFunc pSatd,SWelsME * pMe,const int32_t kiEncStride,const int32_t kiRefStride)284 void NotCalculateSatdCost (PSampleSadSatdCostFunc pSatd, SWelsME* pMe,
285 const int32_t kiEncStride, const int32_t kiRefStride) {
286 }
287
288
289 /////////////////////////
290 // Diamond Search Basics
291 /////////////////////////
WelsMeSadCostSelect(int32_t * iSadCost,const uint16_t * kpMvdCost,int32_t * pBestCost,const int32_t kiDx,const int32_t kiDy,int32_t * pIx,int32_t * pIy)292 bool WelsMeSadCostSelect (int32_t* iSadCost, const uint16_t* kpMvdCost, int32_t* pBestCost, const int32_t kiDx,
293 const int32_t kiDy, int32_t* pIx, int32_t* pIy) {
294 int32_t iTempSadCost[4];
295 int32_t iInputSadCost = *pBestCost;
296 iTempSadCost[0] = iSadCost[0] + COST_MVD (kpMvdCost, kiDx, kiDy - 4);
297 iTempSadCost[1] = iSadCost[1] + COST_MVD (kpMvdCost, kiDx, kiDy + 4);
298 iTempSadCost[2] = iSadCost[2] + COST_MVD (kpMvdCost, kiDx - 4, kiDy);
299 iTempSadCost[3] = iSadCost[3] + COST_MVD (kpMvdCost, kiDx + 4, kiDy);
300
301 if (iTempSadCost[0] < *pBestCost) {
302 *pBestCost = iTempSadCost[0];
303 *pIx = 0;
304 *pIy = 1;
305 }
306
307 if (iTempSadCost[1] < *pBestCost) {
308 *pBestCost = iTempSadCost[1];
309 *pIx = 0;
310 *pIy = -1;
311 }
312
313 if (iTempSadCost[2] < *pBestCost) {
314 *pBestCost = iTempSadCost[2];
315 *pIx = 1;
316 *pIy = 0;
317 }
318
319 if (iTempSadCost[3] < *pBestCost) {
320 *pBestCost = iTempSadCost[3];
321 *pIx = -1;
322 *pIy = 0;
323 }
324 return (*pBestCost == iInputSadCost);
325 }
326
WelsDiamondSearch(SWelsFuncPtrList * pFuncList,SWelsME * pMe,SSlice * pSlice,const int32_t kiStrideEnc,const int32_t kiStrideRef)327 void WelsDiamondSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice,
328 const int32_t kiStrideEnc, const int32_t kiStrideRef) {
329 PSample4SadCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiBlockSize];
330
331 uint8_t* pFref = pMe->pRefMb;
332 uint8_t* const kpEncMb = pMe->pEncMb;
333 const uint16_t* kpMvdCost = pMe->pMvdCost;
334
335 const SMVUnitXY ksMvStartMin = pSlice->sMvStartMin;
336 const SMVUnitXY ksMvStartMax = pSlice->sMvStartMax;
337
338 int32_t iMvDx = ((pMe->sMv.iMvX) * (1 << 2)) - pMe->sMvp.iMvX;
339 int32_t iMvDy = ((pMe->sMv.iMvY) * (1 << 2)) - pMe->sMvp.iMvY;
340
341 uint8_t* pRefMb = pFref;
342 int32_t iBestCost = (pMe->uiSadCost);
343
344 int32_t iTimeThreshold = ITERATIVE_TIMES;
345 ENFORCE_STACK_ALIGN_1D (int32_t, iSadCosts, 4, 16)
346
347 while (iTimeThreshold--) {
348 pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) >> 2;
349 pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) >> 2;
350 if (!CheckMvInRange (pMe->sMv, ksMvStartMin, ksMvStartMax))
351 continue;
352 pSad (kpEncMb, kiStrideEnc, pRefMb, kiStrideRef, &iSadCosts[0]);
353
354 int32_t iX, iY;
355
356 const bool kbIsBestCostWorse = WelsMeSadCostSelect (iSadCosts, kpMvdCost, &iBestCost, iMvDx, iMvDy, &iX, &iY);
357 if (kbIsBestCostWorse)
358 break;
359
360 iMvDx -= (iX * (1 << 2)) ;
361 iMvDy -= (iY * (1 << 2)) ;
362
363 pRefMb -= (iX + iY * kiStrideRef);
364
365 }
366
367 /* integer-pel mv */
368 pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) >> 2;
369 pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) >> 2;
370 pMe->uiSatdCost = pMe->uiSadCost = (iBestCost);
371 pMe->pRefMb = pRefMb;
372 }
373
374 /////////////////////////
375 // DirectionalMv Basics
376 /////////////////////////
CheckDirectionalMv(PSampleSadSatdCostFunc pSad,SWelsME * pMe,const SMVUnitXY ksMinMv,const SMVUnitXY ksMaxMv,const int32_t kiEncStride,const int32_t kiRefStride,int32_t & iBestSadCost)377 bool CheckDirectionalMv (PSampleSadSatdCostFunc pSad, SWelsME* pMe,
378 const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
379 int32_t& iBestSadCost) {
380 const int16_t kiMvX = pMe->sDirectionalMv.iMvX;
381 const int16_t kiMvY = pMe->sDirectionalMv.iMvY;
382
383 //Check MV from scrolling detection
384 if ((BLOCK_16x16 != pMe->uiBlockSize) //scrolled_MV with P16x16 is checked SKIP checking function
385 && (kiMvX | kiMvY) //(0,0) checked in ordinary initial point checking
386 && CheckMvInRange (pMe->sDirectionalMv, ksMinMv, ksMaxMv)) {
387 uint8_t* pRef = &pMe->pColoRefMb[kiMvY * kiRefStride + kiMvX];
388 uint32_t uiCurrentSadCost = pSad (pMe->pEncMb, kiEncStride, pRef, kiRefStride) +
389 COST_MVD (pMe->pMvdCost, (kiMvX * (1 << 2)) - pMe->sMvp.iMvX, (kiMvY * (1 << 2)) - pMe->sMvp.iMvY);
390 if (uiCurrentSadCost < pMe->uiSadCost) {
391 iBestSadCost = uiCurrentSadCost;
392 return true;
393 }
394 }
395 return false;
396 }
397
CheckDirectionalMvFalse(PSampleSadSatdCostFunc pSad,SWelsME * vpMe,const SMVUnitXY ksMinMv,const SMVUnitXY ksMaxMv,const int32_t kiEncStride,const int32_t kiRefStride,int32_t & iBestSadCost)398 bool CheckDirectionalMvFalse (PSampleSadSatdCostFunc pSad, SWelsME* vpMe,
399 const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
400 int32_t& iBestSadCost) {
401 return false;
402 }
403
404 /////////////////////////
405 // Cross Search Basics
406 /////////////////////////
407 #if defined (X86_ASM)
/*!
 * \brief   Build the MVD costs of eight consecutive integer-pel positions
 *
 * Output entry i is pMvdTable[4 * (kiStartMv + i)] + kiFixedCost, truncated
 * to 16 bits; the table is read with a stride of four entries.
 *
 * \param   pMvdCost      out: eight cost values
 * \param   kiStartMv     index of the first position (may be negative)
 * \param   pMvdTable     MVD cost table
 * \param   kiFixedCost   constant added to every entry
 *
 * \return  NONE
 */
void CalcMvdCostx8_c (uint16_t* pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost) {
  const int32_t kiFirstIdx = (kiStartMv * (1 << 2));
  int32_t iPos;

  for (iPos = 0; iPos < 8; ++iPos) {
    pMvdCost[iPos] = (pMvdTable[kiFirstIdx + (iPos * 4)] + kiFixedCost);
  }
}
VerticalFullSearchUsingSSE41(SWelsFuncPtrList * pFuncList,SWelsME * pMe,uint16_t * pMvdTable,const int32_t kiEncStride,const int32_t kiRefStride,const int16_t kiMinMv,const int16_t kiMaxMv,const bool bVerticalSearch)417 void VerticalFullSearchUsingSSE41 (SWelsFuncPtrList* pFuncList, SWelsME* pMe,
418 uint16_t* pMvdTable,
419 const int32_t kiEncStride, const int32_t kiRefStride,
420 const int16_t kiMinMv, const int16_t kiMaxMv,
421 const bool bVerticalSearch) {
422 uint8_t* kpEncMb = pMe->pEncMb;
423 const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
424 uint8_t* pRef = &pMe->pColoRefMb[kiMinMv * kiRefStride];
425
426 const int32_t kiCurMeBlockPixY = pMe->iCurMeBlockPixY;
427
428 int32_t iMinPos = kiCurMeBlockPixY + kiMinMv;
429 int32_t iMaxPos = kiCurMeBlockPixY + kiMaxMv;
430 int32_t iFixedMvd = * (pMvdTable - pMe->sMvp.iMvX);
431 uint16_t* pMvdCost = & (pMvdTable[ (kiMinMv * (1 << 2)) - pMe->sMvp.iMvY]);
432 int16_t iStartMv = 0;
433
434
435 const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
436 const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
437 PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
438 PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
439 PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 :
440 TransposeMatrixBlock8x8_mmx;
441 PTransposeMatrixBlocksFunc TransposeMatrixBlocks = kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 :
442 TransposeMatrixBlocksx8_mmx;
443
444 const int32_t kiDiff = iMaxPos - iMinPos;
445 const int32_t kiRowNum = WELS_ALIGN ((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
446 const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum >> 4) : (kiRowNum >> 3);
447 int32_t iCountLoop8 = (kiRowNum - kiEdgeBlocks) >> 3;
448 const int32_t kiRemainingVectors = kiDiff - (iCountLoop8 << 3);
449 const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
450 ENFORCE_STACK_ALIGN_2D (uint8_t, uiMatrixRef, 16, kiMatrixStride, 16); // transpose matrix result for ref
451 ENFORCE_STACK_ALIGN_2D (uint8_t, uiMatrixEnc, 16, 16, 16); // transpose matrix result for enc
452 assert (kiRowNum <= kiMatrixStride); // make sure effective memory
453
454 TransposeMatrixBlock (&uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride);
455 TransposeMatrixBlocks (&uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum);
456 ENFORCE_STACK_ALIGN_1D (uint16_t, uiBaseCost, 8, 16);
457 int32_t iTargetPos = iMinPos;
458 int16_t iBestPos = pMe->sMv.iMvX;
459 uint32_t uiBestCost = pMe->uiSadCost;
460 uint32_t uiCostMin;
461 int32_t iIndexMinPos;
462 kpEncMb = &uiMatrixEnc[0][0];
463 pRef = &uiMatrixRef[0][0];
464
465 while (iCountLoop8 > 0) {
466 CalcMvdCostx8_c (uiBaseCost, iStartMv, pMvdCost, iFixedMvd);
467 uiCostMin = pSampleSadHor8 (kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos);
468 if (uiCostMin < uiBestCost) {
469 uiBestCost = uiCostMin;
470 iBestPos = iTargetPos + iIndexMinPos;
471 }
472 iTargetPos += 8;
473 pRef += 8;
474 iStartMv += 8;
475 -- iCountLoop8;
476 }
477 if (kiRemainingVectors > 0) {
478 kpEncMb = pMe->pEncMb;
479 pRef = &pMe->pColoRefMb[ (iTargetPos - kiCurMeBlockPix) * kiRefStride];
480 while (iTargetPos < iMaxPos) {
481 const uint16_t uiMvdCost = pMvdCost[iStartMv * (1 << 2)];
482 uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + uiMvdCost);
483 if (uiSadCost < uiBestCost) {
484 uiBestCost = uiSadCost;
485 iBestPos = iTargetPos;
486 }
487 iStartMv++;
488 pRef += kiRefStride;
489 ++iTargetPos;
490 }
491 }
492 if (uiBestCost < pMe->uiSadCost) {
493 SMVUnitXY sBestMv;
494 sBestMv.iMvX = 0;
495 sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
496 UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY * kiRefStride], pMe);
497 }
498 }
499
HorizontalFullSearchUsingSSE41(SWelsFuncPtrList * pFuncList,SWelsME * pMe,uint16_t * pMvdTable,const int32_t kiEncStride,const int32_t kiRefStride,const int16_t kiMinMv,const int16_t kiMaxMv,const bool bVerticalSearch)500 void HorizontalFullSearchUsingSSE41 (SWelsFuncPtrList* pFuncList, SWelsME* pMe,
501 uint16_t* pMvdTable,
502 const int32_t kiEncStride, const int32_t kiRefStride,
503 const int16_t kiMinMv, const int16_t kiMaxMv,
504 const bool bVerticalSearch) {
505 uint8_t* kpEncMb = pMe->pEncMb;
506
507 const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
508 int32_t iMinPos = iCurMeBlockPixX + kiMinMv;
509 int32_t iMaxPos = iCurMeBlockPixX + kiMaxMv;
510 int32_t iFixedMvd = * (pMvdTable - pMe->sMvp.iMvY);
511 uint16_t* pMvdCost = & (pMvdTable[ (kiMinMv * (1 << 2)) - pMe->sMvp.iMvX]);
512 int16_t iStartMv = 0;
513 uint8_t* pRef = &pMe->pColoRefMb[kiMinMv];
514 const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
515 PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
516 PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
517 ENFORCE_STACK_ALIGN_1D (uint16_t, uiBaseCost, 8, 16);
518 const int32_t kiNumVector = iMaxPos - iMinPos;
519 int32_t iCountLoop8 = kiNumVector >> 3;
520 const int32_t kiRemainingLoop8 = kiNumVector & 7;
521 int32_t iTargetPos = iMinPos;
522 int16_t iBestPos = pMe->sMv.iMvX;
523 uint32_t uiBestCost = pMe->uiSadCost;
524 uint32_t uiCostMin;
525 int32_t iIndexMinPos;
526
527 while (iCountLoop8 > 0) {
528 CalcMvdCostx8_c (uiBaseCost, iStartMv, pMvdCost, iFixedMvd);
529 uiCostMin = pSampleSadHor8 (kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos);
530 if (uiCostMin < uiBestCost) {
531 uiBestCost = uiCostMin;
532 iBestPos = iTargetPos + iIndexMinPos;
533 }
534 iTargetPos += 8;
535 pRef += 8;
536 iStartMv += 8;
537 -- iCountLoop8;
538 }
539 if (kiRemainingLoop8 > 0) {
540 while (iTargetPos < iMaxPos) {
541 const uint16_t uiMvdCost = pMvdCost[iStartMv * (1 << 2)];
542 uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + uiMvdCost);
543 if (uiSadCost < uiBestCost) {
544 uiBestCost = uiSadCost;
545 iBestPos = iTargetPos;
546 }
547 iStartMv++;
548 ++pRef;
549 ++iTargetPos;
550 }
551 }
552 if (uiBestCost < pMe->uiSadCost) {
553 SMVUnitXY sBestMv;
554 sBestMv.iMvX = iBestPos - iCurMeBlockPixX;
555 sBestMv.iMvY = 0;
556 UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvX], pMe);
557 }
558 }
559 #endif
LineFullSearch_c(SWelsFuncPtrList * pFuncList,SWelsME * pMe,uint16_t * pMvdTable,const int32_t kiEncStride,const int32_t kiRefStride,const int16_t iMinMv,const int16_t iMaxMv,const bool bVerticalSearch)560 void LineFullSearch_c (SWelsFuncPtrList* pFuncList, SWelsME* pMe,
561 uint16_t* pMvdTable,
562 const int32_t kiEncStride, const int32_t kiRefStride,
563 const int16_t iMinMv, const int16_t iMaxMv,
564 const bool bVerticalSearch) {
565 PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
566 const int32_t kiCurMeBlockPixX = pMe->iCurMeBlockPixX;
567 const int32_t kiCurMeBlockPixY = pMe->iCurMeBlockPixY;
568 int32_t iMinPos, iMaxPos;
569 int32_t iFixedMvd;
570 int32_t iCurMeBlockPix;
571 int32_t iStride;
572 uint16_t* pMvdCost;
573
574 if (bVerticalSearch) {
575 iMinPos = kiCurMeBlockPixY + iMinMv;
576 iMaxPos = kiCurMeBlockPixY + iMaxMv;
577 iFixedMvd = * (pMvdTable - pMe->sMvp.iMvX);
578 iCurMeBlockPix = pMe->iCurMeBlockPixY;
579 iStride = kiRefStride;
580 pMvdCost = & (pMvdTable[ (iMinMv * (1 << 2)) - pMe->sMvp.iMvY]);
581 } else {
582 iMinPos = kiCurMeBlockPixX + iMinMv;
583 iMaxPos = kiCurMeBlockPixX + iMaxMv;
584 iFixedMvd = * (pMvdTable - pMe->sMvp.iMvY);
585 iCurMeBlockPix = pMe->iCurMeBlockPixX;
586 iStride = 1;
587 pMvdCost = & (pMvdTable[ (iMinMv * (1 << 2)) - pMe->sMvp.iMvX]);
588 }
589 uint8_t* pRef = &pMe->pColoRefMb[ iMinMv * iStride];
590 uint32_t uiBestCost = 0xFFFFFFFF;
591 int32_t iBestPos = 0;
592
593 for (int32_t iTargetPos = iMinPos; iTargetPos < iMaxPos; ++ iTargetPos) {
594 uint8_t* const kpEncMb = pMe->pEncMb;
595 uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + *pMvdCost);
596 if (uiSadCost < uiBestCost) {
597 uiBestCost = uiSadCost;
598 iBestPos = iTargetPos;
599 }
600 pRef += iStride;
601 pMvdCost += 4;
602 }
603
604 if (uiBestCost < pMe->uiSadCost) {
605 SMVUnitXY sBestMv;
606 sBestMv.iMvX = bVerticalSearch ? 0 : (iBestPos - iCurMeBlockPix);
607 sBestMv.iMvY = bVerticalSearch ? (iBestPos - iCurMeBlockPix) : 0;
608 UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY * kiRefStride + sBestMv.iMvX], pMe);
609 }
610 }
611
WelsMotionCrossSearch(SWelsFuncPtrList * pFuncList,SWelsME * pMe,SSlice * pSlice,const int32_t kiEncStride,const int32_t kiRefStride)612 void WelsMotionCrossSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice,
613 const int32_t kiEncStride, const int32_t kiRefStride) {
614 PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
615 PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
616
617 //vertical search
618 pfVerticalFullSearchFunc (pFuncList, pMe,
619 pMe->pMvdCost,
620 kiEncStride, kiRefStride,
621 pSlice->sMvStartMin.iMvY,
622 pSlice->sMvStartMax.iMvY, true);
623
624 //horizontal search
625 if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
626 pfHorizontalFullSearchFunc (pFuncList, pMe,
627 pMe->pMvdCost,
628 kiEncStride, kiRefStride,
629 pSlice->sMvStartMin.iMvX,
630 pSlice->sMvStartMax.iMvX,
631 false);
632 }
633 }
634
635
636 /////////////////////////
637 // Feature Search Basics
638 /////////////////////////
639 //memory related
RequestFeatureSearchPreparation(CMemoryAlign * pMa,const int32_t kiFrameWidth,const int32_t kiFrameHeight,const int32_t iNeedFeatureStorage,SFeatureSearchPreparation * pFeatureSearchPreparation)640 int32_t RequestFeatureSearchPreparation (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight,
641 const int32_t iNeedFeatureStorage,
642 SFeatureSearchPreparation* pFeatureSearchPreparation) {
643 const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage >> 16;
644 const bool bFme8x8 = ((iNeedFeatureStorage & 0x0000FF & ME_FME) == ME_FME);
645 const int32_t kiMarginSize = bFme8x8 ? 8 : 16;
646 const int32_t kiFrameSize = (kiFrameWidth - kiMarginSize) * (kiFrameHeight - kiMarginSize);
647 int32_t iListOfFeatureOfBlock;
648
649 if (0 == kiFeatureStrategyIndex) {
650 iListOfFeatureOfBlock = sizeof (uint16_t) * kiFrameSize;
651 } else {
652 iListOfFeatureOfBlock = sizeof (uint16_t) * kiFrameSize +
653 (kiFrameWidth - kiMarginSize) * sizeof (uint32_t) + kiFrameWidth * 8 * sizeof (uint8_t);
654 }
655 pFeatureSearchPreparation->pFeatureOfBlock =
656 (uint16_t*)pMa->WelsMallocz (iListOfFeatureOfBlock, "pFeatureOfBlock");
657 WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == (pFeatureSearchPreparation->pFeatureOfBlock))
658
659 pFeatureSearchPreparation->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
660 pFeatureSearchPreparation->bFMESwitchFlag = true;
661 pFeatureSearchPreparation->uiFMEGoodFrameCount = FMESWITCH_DEFAULT_GOODFRAME_NUM;
662 pFeatureSearchPreparation->iHighFreMbCount = 0;
663
664 return ENC_RETURN_SUCCESS;
665 }
ReleaseFeatureSearchPreparation(CMemoryAlign * pMa,uint16_t * & pFeatureOfBlock)666 int32_t ReleaseFeatureSearchPreparation (CMemoryAlign* pMa, uint16_t*& pFeatureOfBlock) {
667 if (pMa && pFeatureOfBlock) {
668 pMa->WelsFree (pFeatureOfBlock, "pFeatureOfBlock");
669 pFeatureOfBlock = NULL;
670 return ENC_RETURN_SUCCESS;
671 }
672 return ENC_RETURN_UNEXPECTED;
673 }
674
RequestScreenBlockFeatureStorage(CMemoryAlign * pMa,const int32_t kiFrameWidth,const int32_t kiFrameHeight,const int32_t iNeedFeatureStorage,SScreenBlockFeatureStorage * pScreenBlockFeatureStorage)675 int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight,
676 const int32_t iNeedFeatureStorage,
677 SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
678
679 const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage >> 16;
680 const int32_t kiMe8x8FME = iNeedFeatureStorage & 0x0000FF & ME_FME;
681 const int32_t kiMe16x16FME = ((iNeedFeatureStorage & 0x00FF00) >> 8) & ME_FME;
682 if ((kiMe8x8FME == ME_FME) && (kiMe16x16FME == ME_FME)) {
683 return ENC_RETURN_UNSUPPORTED_PARA;
684 //the following memory allocation cannot support when FME at both size
685 }
686
687 const bool bIsBlock8x8 = (kiMe8x8FME == ME_FME);
688 const int32_t kiMarginSize = bIsBlock8x8 ? 8 : 16;
689 const int32_t kiFrameSize = (kiFrameWidth - kiMarginSize) * (kiFrameHeight - kiMarginSize);
690 const int32_t kiListSize = (0 == kiFeatureStrategyIndex) ? (bIsBlock8x8 ? LIST_SIZE_SUM_8x8 : LIST_SIZE_SUM_16x16) :
691 256;
692
693 pScreenBlockFeatureStorage->pTimesOfFeatureValue = (uint32_t*)pMa->WelsMallocz (kiListSize * sizeof (uint32_t),
694 "pScreenBlockFeatureStorage->pTimesOfFeatureValue");
695 WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pTimesOfFeatureValue)
696
697 pScreenBlockFeatureStorage->pLocationOfFeature = (uint16_t**)pMa->WelsMallocz (kiListSize * sizeof (uint16_t*),
698 "pScreenBlockFeatureStorage->pLocationOfFeature");
699 WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationOfFeature)
700
701 pScreenBlockFeatureStorage->pLocationPointer = (uint16_t*)pMa->WelsMallocz (2 * kiFrameSize * sizeof (uint16_t),
702 "pScreenBlockFeatureStorage->pLocationPointer");
703 WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationPointer)
704 // uint16_t* pFeatureValuePointerList[WELS_MAX (LIST_SIZE_SUM_16x16, LIST_SIZE_MSE_16x16)] = {0};
705 pScreenBlockFeatureStorage->pFeatureValuePointerList = (uint16_t**)pMa->WelsMallocz (WELS_MAX (LIST_SIZE_SUM_16x16,
706 LIST_SIZE_MSE_16x16) * sizeof (uint16_t*),
707 "pScreenBlockFeatureStorage->pFeatureValuePointerList");
708 WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pFeatureValuePointerList)
709
710 pScreenBlockFeatureStorage->pFeatureOfBlockPointer = NULL;
711 pScreenBlockFeatureStorage->iIs16x16 = !bIsBlock8x8;
712 pScreenBlockFeatureStorage->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
713 pScreenBlockFeatureStorage->iActualListSize = kiListSize;
714 WelsSetMemMultiplebytes_c (pScreenBlockFeatureStorage->uiSadCostThreshold, UINT_MAX, BLOCK_SIZE_ALL, sizeof (uint32_t));
715 pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = false;
716
717 return ENC_RETURN_SUCCESS;
718 }
ReleaseScreenBlockFeatureStorage(CMemoryAlign * pMa,SScreenBlockFeatureStorage * pScreenBlockFeatureStorage)719 int32_t ReleaseScreenBlockFeatureStorage (CMemoryAlign* pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
720 if (pMa && pScreenBlockFeatureStorage) {
721 if (pScreenBlockFeatureStorage->pTimesOfFeatureValue) {
722 pMa->WelsFree (pScreenBlockFeatureStorage->pTimesOfFeatureValue, "pScreenBlockFeatureStorage->pTimesOfFeatureValue");
723 pScreenBlockFeatureStorage->pTimesOfFeatureValue = NULL;
724 }
725
726 if (pScreenBlockFeatureStorage->pLocationOfFeature) {
727 pMa->WelsFree (pScreenBlockFeatureStorage->pLocationOfFeature, "pScreenBlockFeatureStorage->pLocationOfFeature");
728 pScreenBlockFeatureStorage->pLocationOfFeature = NULL;
729 }
730
731 if (pScreenBlockFeatureStorage->pLocationPointer) {
732 pMa->WelsFree (pScreenBlockFeatureStorage->pLocationPointer, "pScreenBlockFeatureStorage->pLocationPointer");
733 pScreenBlockFeatureStorage->pLocationPointer = NULL;
734 }
735
736 if (pScreenBlockFeatureStorage->pFeatureValuePointerList) {
737 pMa->WelsFree (pScreenBlockFeatureStorage->pFeatureValuePointerList,
738 "pScreenBlockFeatureStorage->pFeatureValuePointerList");
739 pScreenBlockFeatureStorage->pFeatureValuePointerList = NULL;
740 }
741
742 return ENC_RETURN_SUCCESS;
743 }
744 return ENC_RETURN_UNEXPECTED;
745 }
746
747 //preprocess related
/*!
 * \brief   Sum the 64 samples of one 8x8 block.
 *
 * \param   pRef         top-left sample of the block
 * \param   kiRefStride  distance, in samples, between the starts of consecutive rows
 *
 * \return  sum of all samples in the block
 */
int32_t SumOf8x8SingleBlock_c (uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iTotal = 0;
  for (int32_t iRow = 0; iRow < 8; ++iRow) {
    const uint8_t* pLine = pRef + iRow * kiRefStride;
    for (int32_t iCol = 0; iCol < 8; ++iCol) {
      iTotal += pLine[iCol];
    }
  }
  return iTotal;
}
/*!
 * \brief   Sum the 256 samples of one 16x16 block.
 *
 * \param   pRef         top-left sample of the block
 * \param   kiRefStride  distance, in samples, between the starts of consecutive rows
 *
 * \return  sum of all samples in the block
 */
int32_t SumOf16x16SingleBlock_c (uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iTotal = 0;
  for (int32_t iRow = 0; iRow < 16; ++iRow) {
    const uint8_t* pLine = pRef + iRow * kiRefStride;
    for (int32_t iCol = 0; iCol < 16; ++iCol) {
      iTotal += pLine[iCol];
    }
  }
  return iTotal;
}
768
SumOf8x8BlockOfFrame_c(uint8_t * pRefPicture,const int32_t kiWidth,const int32_t kiHeight,const int32_t kiRefStride,uint16_t * pFeatureOfBlock,uint32_t pTimesOfFeatureValue[])769 void SumOf8x8BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
770 const int32_t kiRefStride,
771 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
772 int32_t x, y;
773 uint8_t* pRef;
774 uint16_t* pBuffer;
775 int32_t iSum;
776 for (y = 0; y < kiHeight; y++) {
777 pRef = pRefPicture + kiRefStride * y;
778 pBuffer = pFeatureOfBlock + kiWidth * y;
779 for (x = 0; x < kiWidth; x++) {
780 iSum = SumOf8x8SingleBlock_c (pRef + x, kiRefStride);
781
782 pBuffer[x] = iSum;
783 pTimesOfFeatureValue[iSum]++;
784 }
785 }
786 }
787
SumOf16x16BlockOfFrame_c(uint8_t * pRefPicture,const int32_t kiWidth,const int32_t kiHeight,const int32_t kiRefStride,uint16_t * pFeatureOfBlock,uint32_t pTimesOfFeatureValue[])788 void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
789 const int32_t kiRefStride,
790 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
791 //TODO: this is similar to SumOf8x8BlockOfFrame_c expect the calling of single block func, refactor-able?
792 int32_t x, y;
793 uint8_t* pRef;
794 uint16_t* pBuffer;
795 int32_t iSum;
796 for (y = 0; y < kiHeight; y++) {
797 pRef = pRefPicture + kiRefStride * y;
798 pBuffer = pFeatureOfBlock + kiWidth * y;
799 for (x = 0; x < kiWidth; x++) {
800 iSum = SumOf16x16SingleBlock_c (pRef + x, kiRefStride);
801
802 pBuffer[x] = iSum;
803 pTimesOfFeatureValue[iSum]++;
804 }
805 }
806 }
807
/*!
 * \brief   Carve pBuf into one consecutive slice per feature value.
 *
 * Slice i starts where slice i-1 ended and spans 2 * pTimesOfFeatureValue[i] entries.
 * Both output tables receive the same slice start for every feature value.
 *
 * \param   pTimesOfFeatureValue      occurrence count per feature value
 * \param   pBuf                      backing buffer the slices are taken from
 * \param   kiListSize                number of feature values to process
 * \param   pLocationOfFeature        output: start of each slice
 * \param   pFeatureValuePointerList  output: start of each slice (same values)
 */
void InitializeHashforFeature_c (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) {
  uint16_t* pSliceStart = pBuf;
  int32_t iFeature = 0;
  while (iFeature < kiListSize) {
    pFeatureValuePointerList[iFeature] = pSliceStart;
    pLocationOfFeature[iFeature] = pSliceStart;
    //two entries are reserved per occurrence
    pSliceStart += (pTimesOfFeatureValue[iFeature] << 1);
    ++iFeature;
  }
}
/*!
 * \brief   Append the position of every block to the list of its feature value.
 *
 * For the block at column x and row y the pair (x << 2, y << 2) is written through the write
 * cursor of that block's feature value, and the cursor then advances by two entries.
 *
 * \param   pFeatureOfBlock           feature value per position, kiWidth entries per row
 * \param   kiWidth                   number of positions per row
 * \param   kiHeight                  number of rows
 * \param   pFeatureValuePointerList  in/out: write cursor per feature value
 */
void FillQpelLocationByFeatureValue_c (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
                                       uint16_t** pFeatureValuePointerList) {
  const uint16_t* pRowFeature = pFeatureOfBlock;
  for (int32_t iRow = 0; iRow < kiHeight; ++iRow, pRowFeature += kiWidth) {
    const int32_t kiQpelY = iRow << 2;
    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
      uint16_t** ppCursor = &pFeatureValuePointerList[pRowFeature[iCol]];
      (*ppCursor)[0] = iCol << 2;
      (*ppCursor)[1] = kiQpelY;
      *ppCursor += 2;
    }
  }
}
834
CalculateFeatureOfBlock(SWelsFuncPtrList * pFunc,SPicture * pRef,SScreenBlockFeatureStorage * pScreenBlockFeatureStorage)835 bool CalculateFeatureOfBlock (SWelsFuncPtrList* pFunc, SPicture* pRef,
836 SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
837 uint16_t* pFeatureOfBlock = pScreenBlockFeatureStorage->pFeatureOfBlockPointer;
838 uint32_t* pTimesOfFeatureValue = pScreenBlockFeatureStorage->pTimesOfFeatureValue;
839 uint16_t** pLocationOfFeature = pScreenBlockFeatureStorage->pLocationOfFeature;
840 uint16_t* pBuf = pScreenBlockFeatureStorage->pLocationPointer;
841
842 if (NULL == pFeatureOfBlock || NULL == pTimesOfFeatureValue || NULL == pLocationOfFeature || NULL == pBuf
843 || NULL == pRef->pData[0]) {
844 return false;
845 }
846
847 uint8_t* pRefData = pRef->pData[0];
848 const int32_t iRefStride = pRef->iLineSize[0];
849 int32_t iIs16x16 = pScreenBlockFeatureStorage->iIs16x16;
850 const int32_t iEdgeDiscard = (iIs16x16 ? 16 : 8); //this is to save complexity of padding on pRef
851 const int32_t iWidth = pRef->iWidthInPixel - iEdgeDiscard;
852 const int32_t kiHeight = pRef->iHeightInPixel - iEdgeDiscard;
853 const int32_t kiActualListSize = pScreenBlockFeatureStorage->iActualListSize;
854
855 memset (pTimesOfFeatureValue, 0, sizeof (int32_t)*kiActualListSize);
856 (pFunc->pfCalculateBlockFeatureOfFrame[iIs16x16]) (pRefData, iWidth, kiHeight, iRefStride, pFeatureOfBlock,
857 pTimesOfFeatureValue);
858
859 //assign pLocationOfFeature pointer
860 pFunc->pfInitializeHashforFeature (pTimesOfFeatureValue, pBuf, kiActualListSize,
861 pLocationOfFeature, pScreenBlockFeatureStorage->pFeatureValuePointerList);
862
863 //assign each pixel's pLocationOfFeature
864 pFunc->pfFillQpelLocationByFeatureValue (pFeatureOfBlock, iWidth, kiHeight,
865 pScreenBlockFeatureStorage->pFeatureValuePointerList);
866 return true;
867 }
868
PerformFMEPreprocess(SWelsFuncPtrList * pFunc,SPicture * pRef,uint16_t * pFeatureOfBlock,SScreenBlockFeatureStorage * pScreenBlockFeatureStorage)869 void PerformFMEPreprocess (SWelsFuncPtrList* pFunc, SPicture* pRef, uint16_t* pFeatureOfBlock,
870 SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
871 pScreenBlockFeatureStorage->pFeatureOfBlockPointer = pFeatureOfBlock;
872 pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = CalculateFeatureOfBlock (pFunc, pRef,
873 pScreenBlockFeatureStorage);
874
875 if (pScreenBlockFeatureStorage->bRefBlockFeatureCalculated) {
876 uint32_t uiRefPictureAvgQstepx16 = QStepx16ByQp[WelsMedian (0, pRef->iFrameAverageQp, 51)];
877 uint32_t uiSadCostThreshold16x16 = ((30 * (uiRefPictureAvgQstepx16 + 160)) >> 3);
878 pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x16] = uiSadCostThreshold16x16;
879 pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x8] = (uiSadCostThreshold16x16 >> 2);
880 pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x8]
881 = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x16]
882 = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_4x4] = UINT_MAX;
883 }
884 }
885
886 //search related
SetFeatureSearchIn(SWelsFuncPtrList * pFunc,const SWelsME & sMe,const SSlice * pSlice,SScreenBlockFeatureStorage * pRefFeatureStorage,const int32_t kiEncStride,const int32_t kiRefStride,SFeatureSearchIn * pFeatureSearchIn)887 bool SetFeatureSearchIn (SWelsFuncPtrList* pFunc, const SWelsME& sMe,
888 const SSlice* pSlice, SScreenBlockFeatureStorage* pRefFeatureStorage,
889 const int32_t kiEncStride, const int32_t kiRefStride,
890 SFeatureSearchIn* pFeatureSearchIn) {
891 pFeatureSearchIn->pSad = pFunc->sSampleDealingFuncs.pfSampleSad[sMe.uiBlockSize];
892 pFeatureSearchIn->iFeatureOfCurrent = pFunc->pfCalculateSingleBlockFeature[BLOCK_16x16 == sMe.uiBlockSize] (sMe.pEncMb,
893 kiEncStride);
894
895 pFeatureSearchIn->pEnc = sMe.pEncMb;
896 pFeatureSearchIn->pColoRef = sMe.pColoRefMb;
897 pFeatureSearchIn->iEncStride = kiEncStride;
898 pFeatureSearchIn->iRefStride = kiRefStride;
899 pFeatureSearchIn->uiSadCostThresh = sMe.uiSadCostThreshold;
900
901 pFeatureSearchIn->iCurPixX = sMe.iCurMeBlockPixX;
902 pFeatureSearchIn->iCurPixXQpel = (pFeatureSearchIn->iCurPixX << 2);
903 pFeatureSearchIn->iCurPixY = sMe.iCurMeBlockPixY;
904 pFeatureSearchIn->iCurPixYQpel = (pFeatureSearchIn->iCurPixY << 2);
905
906 pFeatureSearchIn->pTimesOfFeature = pRefFeatureStorage->pTimesOfFeatureValue;
907 pFeatureSearchIn->pQpelLocationOfFeature = pRefFeatureStorage->pLocationOfFeature;
908 pFeatureSearchIn->pMvdCostX = sMe.pMvdCost - pFeatureSearchIn->iCurPixXQpel - sMe.sMvp.iMvX;
909 pFeatureSearchIn->pMvdCostY = sMe.pMvdCost - pFeatureSearchIn->iCurPixYQpel - sMe.sMvp.iMvY;
910
911 pFeatureSearchIn->iMinQpelX = pFeatureSearchIn->iCurPixXQpel + ((pSlice->sMvStartMin.iMvX) * (1 << 2));
912 pFeatureSearchIn->iMinQpelY = pFeatureSearchIn->iCurPixYQpel + ((pSlice->sMvStartMin.iMvY) * (1 << 2));
913 pFeatureSearchIn->iMaxQpelX = pFeatureSearchIn->iCurPixXQpel + ((pSlice->sMvStartMax.iMvX) * (1 << 2));
914 pFeatureSearchIn->iMaxQpelY = pFeatureSearchIn->iCurPixYQpel + ((pSlice->sMvStartMax.iMvY) * (1 << 2));
915
916 if (NULL == pFeatureSearchIn->pSad || NULL == pFeatureSearchIn->pTimesOfFeature
917 || NULL == pFeatureSearchIn->pQpelLocationOfFeature) {
918 return false;
919 }
920 return true;
921 }
SaveFeatureSearchOut(const SMVUnitXY sBestMv,const uint32_t uiBestSadCost,uint8_t * pRef,SFeatureSearchOut * pFeatureSearchOut)922 void SaveFeatureSearchOut (const SMVUnitXY sBestMv, const uint32_t uiBestSadCost, uint8_t* pRef,
923 SFeatureSearchOut* pFeatureSearchOut) {
924 pFeatureSearchOut->sBestMv = sBestMv;
925 pFeatureSearchOut->uiBestSadCost = uiBestSadCost;
926 pFeatureSearchOut->pBestRef = pRef;
927 }
928
FeatureSearchOne(SFeatureSearchIn & sFeatureSearchIn,const int32_t iFeatureDifference,const uint32_t kuiExpectedSearchTimes,SFeatureSearchOut * pFeatureSearchOut)929 bool FeatureSearchOne (SFeatureSearchIn& sFeatureSearchIn, const int32_t iFeatureDifference,
930 const uint32_t kuiExpectedSearchTimes,
931 SFeatureSearchOut* pFeatureSearchOut) {
932 const int32_t iFeatureOfRef = (sFeatureSearchIn.iFeatureOfCurrent + iFeatureDifference);
933 if (iFeatureOfRef < 0 || iFeatureOfRef >= LIST_SIZE)
934 return true;
935
936 PSampleSadSatdCostFunc pSad = sFeatureSearchIn.pSad;
937 uint8_t* pEnc = sFeatureSearchIn.pEnc;
938 uint8_t* pColoRef = sFeatureSearchIn.pColoRef;
939 const int32_t iEncStride = sFeatureSearchIn.iEncStride;
940 const int32_t iRefStride = sFeatureSearchIn.iRefStride;
941 const uint16_t uiSadCostThresh = sFeatureSearchIn.uiSadCostThresh;
942
943 const int32_t iCurPixX = sFeatureSearchIn.iCurPixX;
944 const int32_t iCurPixY = sFeatureSearchIn.iCurPixY;
945 const int32_t iCurPixXQpel = sFeatureSearchIn.iCurPixXQpel;
946 const int32_t iCurPixYQpel = sFeatureSearchIn.iCurPixYQpel;
947
948 const int32_t iMinQpelX = sFeatureSearchIn.iMinQpelX;
949 const int32_t iMinQpelY = sFeatureSearchIn.iMinQpelY;
950 const int32_t iMaxQpelX = sFeatureSearchIn.iMaxQpelX;
951 const int32_t iMaxQpelY = sFeatureSearchIn.iMaxQpelY;
952
953 const int32_t iSearchTimes = WELS_MIN (sFeatureSearchIn.pTimesOfFeature[iFeatureOfRef], kuiExpectedSearchTimes);
954 const int32_t iSearchTimesx2 = (iSearchTimes << 1);
955 const uint16_t* pQpelPosition = sFeatureSearchIn.pQpelLocationOfFeature[iFeatureOfRef];
956
957 SMVUnitXY sBestMv;
958 uint32_t uiBestCost, uiTmpCost;
959 uint8_t* pBestRef, *pCurRef;
960 int32_t iQpelX, iQpelY;
961 int32_t iIntepelX, iIntepelY;
962 int32_t i;
963
964 sBestMv.iMvX = pFeatureSearchOut->sBestMv.iMvX;
965 sBestMv.iMvY = pFeatureSearchOut->sBestMv.iMvY;
966 uiBestCost = pFeatureSearchOut->uiBestSadCost;
967 pBestRef = pFeatureSearchOut->pBestRef;
968
969 for (i = 0; i < iSearchTimesx2; i += 2) {
970 iQpelX = pQpelPosition[i];
971 iQpelY = pQpelPosition[i + 1];
972
973 if ((iQpelX > iMaxQpelX) || (iQpelX < iMinQpelX)
974 || (iQpelY > iMaxQpelY) || (iQpelY < iMinQpelY)
975 || (iQpelX == iCurPixXQpel) || (iQpelY == iCurPixYQpel))
976 continue;
977
978 uiTmpCost = sFeatureSearchIn.pMvdCostX[ iQpelX ] + sFeatureSearchIn.pMvdCostY[ iQpelY ];
979 if (uiTmpCost + iFeatureDifference >= uiBestCost)
980 continue;
981
982 iIntepelX = (iQpelX >> 2) - iCurPixX;
983 iIntepelY = (iQpelY >> 2) - iCurPixY;
984 pCurRef = &pColoRef[iIntepelX + iIntepelY * iRefStride];
985 uiTmpCost += pSad (pEnc, iEncStride, pCurRef, iRefStride);
986 if (uiTmpCost < uiBestCost) {
987 sBestMv.iMvX = iIntepelX;
988 sBestMv.iMvY = iIntepelY;
989 uiBestCost = uiTmpCost;
990 pBestRef = pCurRef;
991
992 if (uiBestCost < uiSadCostThresh)
993 break;
994 }
995 }
996 SaveFeatureSearchOut (sBestMv, uiBestCost, pBestRef, pFeatureSearchOut);
997 return (i < iSearchTimesx2);
998 }
999
1000
MotionEstimateFeatureFullSearch(SFeatureSearchIn & sFeatureSearchIn,const uint32_t kuiMaxSearchPoint,SWelsME * pMe)1001 void MotionEstimateFeatureFullSearch (SFeatureSearchIn& sFeatureSearchIn,
1002 const uint32_t kuiMaxSearchPoint,
1003 SWelsME* pMe) {
1004 SFeatureSearchOut sFeatureSearchOut = { { 0 } };//TODO: this can be refactored and removed
1005 sFeatureSearchOut.uiBestSadCost = pMe->uiSadCost;
1006 sFeatureSearchOut.sBestMv = pMe->sMv;
1007 sFeatureSearchOut.pBestRef = pMe->pRefMb;
1008
1009 int32_t iFeatureDifference = 0;//TODO: change it according to computational-complexity setting when needed
1010 FeatureSearchOne (sFeatureSearchIn, iFeatureDifference, kuiMaxSearchPoint, &sFeatureSearchOut);
1011 if (sFeatureSearchOut.uiBestSadCost < pMe->uiSadCost) { //TODO: this may be refactored and removed
1012 UpdateMeResults (sFeatureSearchOut.sBestMv,
1013 sFeatureSearchOut.uiBestSadCost, sFeatureSearchOut.pBestRef,
1014 pMe);
1015 }
1016 }
1017
1018 //switch related
CountFMECostDown(const SDqLayer * pCurLayer)1019 static uint32_t CountFMECostDown (const SDqLayer* pCurLayer) {
1020 uint32_t uiCostDownSum = 0;
1021 const int32_t kiSliceCount = GetCurrentSliceNum (pCurLayer);
1022 if (kiSliceCount >= 1) {
1023 int32_t iSliceIndex = 0;
1024 SSlice* pSlice = pCurLayer->ppSliceInLayer[iSliceIndex];
1025 while (iSliceIndex < kiSliceCount) {
1026 pSlice = pCurLayer->ppSliceInLayer[iSliceIndex];
1027 uiCostDownSum += pSlice->uiSliceFMECostDown;
1028 ++ iSliceIndex;
1029 }
1030 }
1031 return uiCostDownSum;
1032 }
#define FMESWITCH_MBAVERCOSTSAVING_THRESHOLD (2) //empirically set.
#define FMESWITCH_GOODFRAMECOUNT_MAX (5) //empirically set.
/*!
 * \brief   Move the good-frame counter one step up or down, keeping it in
 *          [0, FMESWITCH_GOODFRAMECOUNT_MAX].
 *
 * The counter goes up when the average cost saving exceeds
 * FMESWITCH_MBAVERCOSTSAVING_THRESHOLD and down otherwise; it stays put at either bound.
 * Both constants are empirical and the strategy may change.
 *
 * \param   iAvMBNormalizedRDcostDown  average cost saving per MB
 * \param   uiFMEGoodFrameCount        in/out: counter to adjust
 */
static void UpdateFMEGoodFrameCount (const uint32_t iAvMBNormalizedRDcostDown, uint8_t& uiFMEGoodFrameCount) {
  const bool kbGoodFrame = (iAvMBNormalizedRDcostDown > FMESWITCH_MBAVERCOSTSAVING_THRESHOLD);
  if (kbGoodFrame && uiFMEGoodFrameCount < FMESWITCH_GOODFRAMECOUNT_MAX) {
    ++ uiFMEGoodFrameCount;
  } else if (!kbGoodFrame && uiFMEGoodFrameCount > 0) {
    -- uiFMEGoodFrameCount;
  }
}
UpdateFMESwitch(SDqLayer * pCurLayer)1046 void UpdateFMESwitch (SDqLayer* pCurLayer) {
1047 const uint32_t iFMECost = CountFMECostDown (pCurLayer);
1048 const uint32_t iAvMBNormalizedRDcostDown = iFMECost / (pCurLayer->iMbWidth * pCurLayer->iMbHeight);
1049 UpdateFMEGoodFrameCount (iAvMBNormalizedRDcostDown, pCurLayer->pFeatureSearchPreparation->uiFMEGoodFrameCount);
1050 }
UpdateFMESwitchNull(SDqLayer * pCurLayer)1051 void UpdateFMESwitchNull (SDqLayer* pCurLayer) {
1052 }
1053 /////////////////////////
1054 // Search function options
1055 /////////////////////////
WelsDiamondCrossSearch(SWelsFuncPtrList * pFunc,SWelsME * pMe,SSlice * pSlice,const int32_t kiEncStride,const int32_t kiRefStride)1056 void WelsDiamondCrossSearch (SWelsFuncPtrList* pFunc, SWelsME* pMe, SSlice* pSlice, const int32_t kiEncStride,
1057 const int32_t kiRefStride) {
1058 // Step 1: diamond search
1059 WelsDiamondSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride);
1060
1061 // Step 2: CROSS search
1062 pMe->uiSadCostThreshold = pMe->pRefFeatureStorage->uiSadCostThreshold[pMe->uiBlockSize];
1063 if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
1064 WelsMotionCrossSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride);
1065 }
1066 }
WelsDiamondCrossFeatureSearch(SWelsFuncPtrList * pFunc,SWelsME * pMe,SSlice * pSlice,const int32_t kiEncStride,const int32_t kiRefStride)1067 void WelsDiamondCrossFeatureSearch (SWelsFuncPtrList* pFunc, SWelsME* pMe, SSlice* pSlice, const int32_t kiEncStride,
1068 const int32_t kiRefStride) {
1069 // Step 1: diamond search + cross
1070 WelsDiamondCrossSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride);
1071
1072 // Step 2: FeatureSearch
1073 if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
1074 pSlice->uiSliceFMECostDown += pMe->uiSadCost;
1075
1076 uint32_t uiMaxSearchPoint = INT_MAX;//TODO: change it according to computational-complexity setting
1077 SFeatureSearchIn sFeatureSearchIn = {0};
1078 if (SetFeatureSearchIn (pFunc, *pMe, pSlice, pMe->pRefFeatureStorage,
1079 kiEncStride, kiRefStride,
1080 &sFeatureSearchIn)) {
1081 MotionEstimateFeatureFullSearch (sFeatureSearchIn, uiMaxSearchPoint, pMe);
1082 }
1083 pSlice->uiSliceFMECostDown -= pMe->uiSadCost;
1084 }
1085 }
1086
1087
1088 } // namespace WelsEnc
1089
1090