• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    svc_base_layer_md.c
33  *
34  * \brief   mode decision
35  *
36  * \date    2009.08.10 Created
37  *
38  *************************************************************************************
39  */
40 #include "ls_defines.h"
41 #include "mv_pred.h"
42 #include "svc_enc_golomb.h"
43 #include "svc_base_layer_md.h"
44 #include "encoder.h"
45 #include "svc_encode_mb.h"
46 #include "svc_encode_slice.h"
47 namespace WelsEnc {
48 static const ALIGNED_DECLARE (int8_t, g_kiIntra16AvaliMode[8][5], 16) = {
49   { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
50   { I16_PRED_DC_L,   I16_PRED_H,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
51   { I16_PRED_DC_T,   I16_PRED_V,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
52   { I16_PRED_V,      I16_PRED_H,       I16_PRED_DC,      I16_PRED_INVALID, 3 },
53   { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
54   { I16_PRED_DC_L,   I16_PRED_H,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
55   { I16_PRED_DC_T,   I16_PRED_V,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
56   { I16_PRED_V,      I16_PRED_H,       I16_PRED_DC,      I16_PRED_P,       4 }
57 };
58 
59 static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailCount[16], 16) = {
60 #ifndef  I4_PRED_MODE_EXTEND
61   1, 3, 2, 4, 1, 3, 2, 7, 1, 3, 4, 6, 1, 3, 4, 9
62 #else
63   1, 3, 4, 4, 1, 3, 4, 7, 1, 3, 4, 6, 1, 3, 4, 9
64 #endif  //I4_PRED_MODE_EXTEND
65 };
66 
67 //left_avail | (top_avail<<1) | (left_top_avail<<2) | (right_top_avail<<3);
68 static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailMode[16][16], 16) = {
69   {
70     I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
71     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
72     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
73     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
74   },  //  0000
75 
76   {
77     I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
78     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
79     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
80     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
81   },  //  0001
82 
83 #ifndef  I4_PRED_MODE_EXTEND
84   {
85     I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
86     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
87     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
88     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
89   }, //  0010
90 #else
91   {
92     I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
93     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
94     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
95     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
96   }, //  0010
97 #endif //I4_PRED_MODE_EXTEND
98 
99   {
100     I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
101     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
102     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
103     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
104   }, //  0011
105 
106   {
107     I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
108     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
109     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
110     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
111   },  //  0100
112 
113   {
114     I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
115     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
116     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
117     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
118   },    //  0101
119 
120 #ifndef  I4_PRED_MODE_EXTEND
121   {
122     I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
123     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
124     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
125     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
126   },     //  0110
127 #else
128   {
129     I4_PRED_DC_T,  I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
130     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
131     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
132     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
133   },     //  0110
134 #endif //I4_PRED_MODE_EXTEND
135 
136   {
137     I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
138     I4_PRED_DDR,     I4_PRED_VR,      I4_PRED_HD,      I4_PRED_INVALID,
139     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
140     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
141   },               //  0111
142 
143   {
144     I4_PRED_DC_128,   I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
145     I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
146     I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
147     I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
148   },  //  1000
149 
150   {
151     I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
152     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
153     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
154     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
155   },    //  1001
156 
157   {
158     I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL,
159     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
160     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
161     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
162   },     //  1010
163 
164   {
165     I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
166     I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_INVALID, I4_PRED_INVALID,
167     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
168     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
169   },          //  1011
170 
171   {
172     I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
173     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
174     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
175     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
176   },  //  1100
177 
178   {
179     I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID,
180     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
181     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
182     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
183   },    //  1101
184 
185   {
186     I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL,
187     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
188     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
189     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
190   },     //  1110
191 
192   {
193     I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
194     I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_DDR,     I4_PRED_VR,
195     I4_PRED_HD,      I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
196     I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
197   }                          //  1111
198 
199 };
200 static const ALIGNED_DECLARE (int8_t, g_kiIntraChromaAvailMode[8][5], 16) = {
201   { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
202   { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
203   { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
204   { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_INVALID, 3 },
205   { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
206   { C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
207   { C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
208   { C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_P,       4 }
209 };
210 
/*
 * Pixel coordinates of the 16 luma 4x4 blocks inside a macroblock, indexed by
 * the block number used in the Intra4x4 loops. Each table holds 16 int8_t
 * entries; they are defined back to back (the original note says this is for
 * cache hits).
 */

// horizontal offset (in pixels) of 4x4 block i
const int8_t g_kiCoordinateIdx4x4X[16] = {
  0,  4,  0,  4,
  8, 12,  8, 12,
  0,  4,  0,  4,
  8, 12,  8, 12
};

// vertical offset (in pixels) of 4x4 block i
const int8_t g_kiCoordinateIdx4x4Y[16] = {
  0,  0,  4,  4,
  0,  0,  4,  4,
  8,  8, 12, 12,
  8,  8, 12, 12
};
223 static const ALIGNED_DECLARE (int8_t, g_kiNeighborIntraToI4x4[16][16], 16) = {
224   { 0,  1,  10, 7,  1,  1,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
225   { 1,  1,  15, 7,  1,  1,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
226   { 10, 15, 10, 7,  15, 7,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
227   { 11, 15, 15, 7,  15, 7,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
228   { 4,  1,  10, 7,  1,  1,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
229   { 5,  1,  15, 7,  1,  1,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
230   { 14, 15, 10, 7,  15, 7,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
231   { 15, 15, 15, 7,  15, 7,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
232   { 0,  1,  10, 7,  1,  9,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
233   { 1,  1,  15, 7,  1,  9,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
234   { 10, 15, 10, 7,  15, 15, 15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
235   { 11, 15, 15, 7,  15, 15, 15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
236   { 4,  1,  10, 7,  1,  9,  15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
237   { 5,  1,  15, 7,  1,  9,  15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
238   { 14, 15, 10, 7,  15, 15, 15, 7,  10, 15, 10, 7,  15, 7,  15, 7},
239   { 15, 15, 15, 7,  15, 15, 15, 7,  15, 15, 15, 7,  15, 7,  15, 7},
240 };
241 
242 ALIGNED_DECLARE (const int8_t, g_kiMapModeI4x4[14], 16) = {
243   0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 3, 7
244 };
245 
/*!
 * \brief   Predict the Intra4x4 mode of a 4x4 block from its left and top neighbors.
 *
 * \param   pIntraPredMode  mode cache; the left neighbor is read at iIdx4 - 1 and
 *                          the top neighbor at iIdx4 - 8
 * \param   iIdx4           position of the current 4x4 block inside pIntraPredMode
 *
 * \return  2 when either neighbor mode is -1, otherwise the smaller of the two
 *          neighbor modes
 */
int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4) {
  const int8_t kiModeAbove = pIntraPredMode[iIdx4 - 8];
  const int8_t kiModeLeft  = pIntraPredMode[iIdx4 - 1];

  // a neighbor marked -1 forces the fallback mode 2
  if (kiModeLeft == -1 || kiModeAbove == -1) {
    return 2;
  }

  return (kiModeLeft < kiModeAbove) ? kiModeLeft : kiModeAbove;
}
258 
WelsMdIntraInit(sWelsEncCtx * pEncCtx,SMB * pCurMb,SMbCache * pMbCache,const int32_t iSliceFirstMbXY)259 void WelsMdIntraInit (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, const int32_t iSliceFirstMbXY) {
260   SDqLayer* pCurLayer = pEncCtx->pCurDqLayer;
261 
262   const int32_t kiMbX  = pCurMb->iMbX;
263   const int32_t kiMbY  = pCurMb->iMbY;
264   const int32_t kiMbXY = pCurMb->iMbXY;
265 
266   // step 3. locating current pEnc and pDec
267   // unroll loops here
268   if (0 == kiMbX || iSliceFirstMbXY == kiMbXY) {
269     int32_t iStrideY, iStrideUV;
270     int32_t iOffsetY, iOffsetUV;
271 
272     iStrideY    = pCurLayer->iEncStride[0];
273     iStrideUV   = pCurLayer->iEncStride[1];
274     iOffsetY    = (kiMbX + kiMbY * iStrideY) << 4;
275     iOffsetUV   = (kiMbX + kiMbY * iStrideUV) << 3;
276     pMbCache->SPicData.pEncMb[0]        = pCurLayer->pEncData[0] + iOffsetY;
277     pMbCache->SPicData.pEncMb[1]        = pCurLayer->pEncData[1] + iOffsetUV;
278     pMbCache->SPicData.pEncMb[2]        = pCurLayer->pEncData[2] + iOffsetUV;
279 
280     iStrideY    = pCurLayer->iCsStride[0];
281     iStrideUV   = pCurLayer->iCsStride[1];
282     iOffsetY    = (kiMbX + kiMbY * iStrideY) << 4;
283     iOffsetUV   = (kiMbX + kiMbY * iStrideUV) << 3;
284     pMbCache->SPicData.pCsMb[0]         = pCurLayer->pCsData[0] + iOffsetY;
285     pMbCache->SPicData.pCsMb[1]         = pCurLayer->pCsData[1] + iOffsetUV;
286     pMbCache->SPicData.pCsMb[2]         = pCurLayer->pCsData[2] + iOffsetUV;
287 
288     iStrideY    = pCurLayer->pDecPic->iLineSize[0];
289     iStrideUV   = pCurLayer->pDecPic->iLineSize[1];
290     iOffsetY    = (kiMbX + kiMbY * iStrideY) << 4;
291     iOffsetUV   = (kiMbX + kiMbY * iStrideUV) << 3;
292     pMbCache->SPicData.pDecMb[0]        = pCurLayer->pDecPic->pData[0] + iOffsetY;
293     pMbCache->SPicData.pDecMb[1]        = pCurLayer->pDecPic->pData[1] + iOffsetUV;
294     pMbCache->SPicData.pDecMb[2]        = pCurLayer->pDecPic->pData[2] + iOffsetUV;
295   } else {
296     pMbCache->SPicData.pEncMb[0]        += MB_WIDTH_LUMA;
297     pMbCache->SPicData.pEncMb[1]        += MB_WIDTH_CHROMA;
298     pMbCache->SPicData.pEncMb[2]        += MB_WIDTH_CHROMA;
299 
300     pMbCache->SPicData.pDecMb[0]        += MB_WIDTH_LUMA;
301     pMbCache->SPicData.pDecMb[1]        += MB_WIDTH_CHROMA;
302     pMbCache->SPicData.pDecMb[2]        += MB_WIDTH_CHROMA;
303 
304     pMbCache->SPicData.pCsMb[0]         += MB_WIDTH_LUMA;
305     pMbCache->SPicData.pCsMb[1]         += MB_WIDTH_CHROMA;
306     pMbCache->SPicData.pCsMb[2]         += MB_WIDTH_CHROMA;
307   }
308 
309   //step 2. initial pWelsMd
310   pCurMb->uiCbp = 0;
311 
312   //step 4: locating scaled_tcoeff
313 
314   //step 1. load neighbor cache
315   FillNeighborCacheIntra (pMbCache, pCurMb, pCurLayer->iMbWidth);
316   pMbCache->pMemPredLuma = pMbCache->pMemPredMb;// in WelsMdI16x16() will be changed, so re-init here!
317   pMbCache->pMemPredChroma = pMbCache->pMemPredMb +
318                              256;// Init with default, maybe change in WelsMdI16x16 and svc_md_i16x16_sad
319 }
320 
WelsMdInterInit(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,const int32_t iSliceFirstMbXY)321 void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t iSliceFirstMbXY) {
322   SDqLayer* pCurLayer = pEncCtx->pCurDqLayer;
323   SMbCache* pMbCache  = &pSlice->sMbCacheInfo;
324   const int32_t kiMbX  = pCurMb->iMbX;
325   const int32_t kiMbY  = pCurMb->iMbY;
326   const int32_t kiMbXY = pCurMb->iMbXY;
327   const int32_t kiMbWidth = pCurLayer->iMbWidth;
328   const int32_t kiMbHeight = pCurLayer->iMbHeight;
329 
330   pMbCache->pEncSad = &pCurLayer->pDecPic->pMbSkipSad[kiMbXY];
331 
332   //step 1. load neighbor cache
333   pEncCtx->pFuncList->pfFillInterNeighborCache (pMbCache, pCurMb, kiMbWidth,
334       pEncCtx->pVaa->pVaaBackgroundMbFlag + kiMbXY); //BGD spatial pFunc
335 
336   //step 3: initial cost
337 
338   //step 4. locating current p_ref
339   // merge loops
340   if (0 == kiMbX || iSliceFirstMbXY == kiMbXY) {
341     const int32_t kiRefStrideY          = pCurLayer->pRefPic->iLineSize[0];
342     const int32_t kiRefStrideUV         = pCurLayer->pRefPic->iLineSize[1];
343     const int32_t kiCurStrideY          = (kiMbX + kiMbY * kiRefStrideY) << 4;
344     const int32_t kiCurStrideUV         = (kiMbX + kiMbY * kiRefStrideUV) << 3;
345     pMbCache->SPicData.pRefMb[0]        = pCurLayer->pRefPic->pData[0] + kiCurStrideY;
346     pMbCache->SPicData.pRefMb[1]        = pCurLayer->pRefPic->pData[1] + kiCurStrideUV;
347     pMbCache->SPicData.pRefMb[2]        = pCurLayer->pRefPic->pData[2] + kiCurStrideUV;
348   } else {
349     pMbCache->SPicData.pRefMb[0]        += MB_WIDTH_LUMA;
350     pMbCache->SPicData.pRefMb[1]        += MB_WIDTH_CHROMA;
351     pMbCache->SPicData.pRefMb[2]        += MB_WIDTH_CHROMA;
352   }
353 
354   pMbCache->uiRefMbType = pCurLayer->pRefPic->uiRefMbType[kiMbXY];
355   pMbCache->bCollocatedPredFlag = false;
356 
357   //comment: sometimes, mode decision process may skip the md_p16x16 and md_pskip function,
358   ST32 (&pCurMb->sP16x16Mv, 0);
359   ST32 (&pCurLayer->pDecPic->sMvList[kiMbXY], 0);
360 
361   SetMvWithinIntegerMvRange (kiMbWidth, kiMbHeight, kiMbX, kiMbY, pEncCtx->iMvRange, & (pSlice->sMvStartMin),
362                              & (pSlice->sMvStartMax));
363 }
364 
WelsMdI16x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SMbCache * pMbCache,int32_t iLambda)365 int32_t WelsMdI16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
366   const int8_t*  kpAvailMode;
367   int32_t iAvailCount;
368   int32_t iIdx = 0;
369   uint8_t* pPredI16x16[2] = {pMbCache->pMemPredMb, pMbCache->pMemPredMb + 256};
370   uint8_t* pDst       = pPredI16x16[0];
371   uint8_t* pDec       = pMbCache->SPicData.pCsMb[0];
372   uint8_t* pEnc       = pMbCache->SPicData.pEncMb[0];
373   int32_t iLineSizeDec = pCurDqLayer->iCsStride[0];
374   int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
375   int32_t i, iCurCost, iCurMode, iBestMode, iBestCost = INT_MAX;
376 
377   int32_t iOffset = pMbCache->uiNeighborIntra & 0x07;
378   iAvailCount = g_kiIntra16AvaliMode[iOffset][4];
379   kpAvailMode = g_kiIntra16AvaliMode[iOffset];
380   if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra16x16Combined3) {
381     iBestCost = pFunc->sSampleDealingFuncs.pfIntra16x16Combined3 (pDec, iLineSizeDec, pEnc, iLineSizeEnc, &iBestMode,
382                 iLambda, pDst/*temp*/);
383     iCurMode = kpAvailMode[3];
384     pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
385     iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc) + iLambda * 4 ;
386     if (iCurCost < iBestCost) {
387       iBestMode = iCurMode;
388       iBestCost = iCurCost;
389     } else {
390       pFunc->pfGetLumaI16x16Pred[iBestMode] (pDst, pDec, iLineSizeDec);
391     }
392     iIdx = 1;
393     iBestCost += iLambda;
394   } else {
395     iBestMode = kpAvailMode[0];
396     for (i = 0; i < iAvailCount; ++ i) {
397       iCurMode = kpAvailMode[i];
398 
399       assert (iCurMode >= 0 && iCurMode < 7);
400 
401       pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
402       iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc);
403       iCurCost += iLambda * (BsSizeUE (g_kiMapModeI16x16[iCurMode]));
404       if (iCurCost < iBestCost) {
405         iBestMode = iCurMode;
406         iBestCost = iCurCost;
407         iIdx = iIdx ^ 0x01;
408         pDst = pPredI16x16[iIdx];
409       }
410     }
411   }
412   pMbCache->pMemPredChroma = pPredI16x16[iIdx];
413 
414   pMbCache->pMemPredLuma = pPredI16x16[iIdx ^ 0x01];
415   pMbCache->uiLumaI16x16Mode  = iBestMode;
416   return iBestCost;
417 }
WelsMdI4x4(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)418 int32_t WelsMdI4x4 (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
419   SWelsFuncPtrList* pFunc       = pEncCtx->pFuncList;
420   SDqLayer* pCurDqLayer         = pEncCtx->pCurDqLayer;
421   int32_t iLambda               = pWelsMd->iLambda;
422   int32_t iBestCostLuma         = pWelsMd->iCostLuma;
423   uint8_t* pEncMb               = pMbCache->SPicData.pEncMb[0];
424   uint8_t* pDecMb               = pMbCache->SPicData.pCsMb[0];
425   const int32_t kiLineSizeEnc   = pCurDqLayer->iEncStride[0];
426   const int32_t kiLineSizeDec   = pCurDqLayer->iCsStride[0];
427 
428   uint8_t* pCurEnc, *pCurDec, *pDst;
429 
430   int32_t iPredMode, iCurMode, iBestMode, iFinalMode;
431   int32_t iCurCost, iBestCost;
432   int32_t iAvailCount;
433   const uint8_t* kpAvailMode;
434   int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
435   int32_t lambda[2] = {iLambda << 2, iLambda};
436   bool* pPrevIntra4x4PredModeFlag       = pMbCache->pPrevIntra4x4PredModeFlag;
437   int8_t* pRemIntra4x4PredModeFlag      = pMbCache->pRemIntra4x4PredModeFlag;
438   const uint8_t* kpIntra4x4AvailCount   = &g_kiIntra4AvailCount[0];
439   const uint8_t* kpCache48CountScan4    = &g_kuiCache48CountScan4Idx[0];
440   const int8_t* kpNeighborIntraToI4x4   = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
441   const int8_t* kpCoordinateIdxX        = &g_kiCoordinateIdx4x4X[0];
442   const int8_t* kpCoordinateIdxY        = &g_kiCoordinateIdx4x4Y[0];
443   int32_t iBestPredBufferNum            = 0;
444   int32_t iCosti4x4                     = 0;
445 
446 #if defined(X86_ASM)
447   WelsPrefetchZero_mmx (g_kiMapModeI4x4);
448   WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
449 #endif//X86_ASM
450 
451   for (i = 0; i < 16; i++) {
452     const int32_t kiOffset = kpNeighborIntraToI4x4[i];
453 
454     //step 1: locating current 4x4 block position in pEnc and pDecMb
455     iCoordinateX = kpCoordinateIdxX[i];
456     iCoordinateY = kpCoordinateIdxY[i];
457 
458     iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
459     pCurEnc = pEncMb + iIdxStrideEnc;
460     iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
461     pCurDec = pDecMb + iIdxStrideDec;
462 
463     //step 2: get predicted mode from neighbor
464     iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
465 
466     //step 3: collect candidates of iPredMode
467     iAvailCount = kpIntra4x4AvailCount[kiOffset];
468     kpAvailMode = g_kiIntra4AvailMode[kiOffset];
469 
470     //step 4: gain the best pred mode
471     iBestCost = INT_MAX;
472     iBestMode = kpAvailMode[0];
473 
474     if (pFunc->sSampleDealingFuncs.pfIntra4x4Combined3 && (iAvailCount >= 6)) {
475       pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
476 
477       iBestCost = pFunc->sSampleDealingFuncs.pfIntra4x4Combined3 (pCurDec, kiLineSizeDec, pCurEnc, kiLineSizeEnc, pDst,
478                   &iBestMode,
479                   lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
480       //     ST64(&pMbCache->pMemPredBlk4[iBestMode<<4], LD64(mem_pred_blk4_temp));
481       //     ST64(&pMbCache->pMemPredBlk4[8+(iBestMode<<4)], LD64(mem_pred_blk4_temp+8));
482 
483       for (j = 3; j < iAvailCount; ++ j) {
484         iCurMode = kpAvailMode[j];
485 
486         assert (iCurMode >= 0 && iCurMode < 14);
487 
488         pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
489 
490         pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
491         iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
492                    lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
493 
494         if (iCurCost < iBestCost) {
495           iBestMode = iCurMode;
496           iBestCost = iCurCost;
497           iBestPredBufferNum = 1 - iBestPredBufferNum;
498         }
499       }
500     } else {
501       for (j = 0; j < iAvailCount; ++ j) {
502         iCurMode = kpAvailMode[j];
503 
504         assert (iCurMode >= 0 && iCurMode < 14);
505 
506         pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
507 
508         pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
509         iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
510                    lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
511 
512         if (iCurCost < iBestCost) {
513           iBestMode = iCurMode;
514           iBestCost = iCurCost;
515           iBestPredBufferNum = 1 - iBestPredBufferNum;
516         }
517       }
518     }
519     pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
520     iCosti4x4 += iBestCost;
521     if (iCosti4x4 >= iBestCostLuma) {
522       break;
523     }
524 
525     //step 5: update pred mode and sample avail cache
526     iFinalMode = g_kiMapModeI4x4[iBestMode];
527     if (iPredMode == iFinalMode) {
528       *pPrevIntra4x4PredModeFlag++ = true;
529     } else {
530       *pPrevIntra4x4PredModeFlag++ = false;
531       *pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
532     }
533     pRemIntra4x4PredModeFlag++;
534     // pCurMb->pIntra4x4PredMode[g_kuiMbCountScan4Idx[i]] = iFinalMode;
535     pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
536 
537     //step 6: encoding I_4x4
538     WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
539   }
540   ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
541   pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
542   pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
543   pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
544   iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
545   return iCosti4x4;
546 }
547 
WelsMdI4x4Fast(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)548 int32_t WelsMdI4x4Fast (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
549   SWelsFuncPtrList* pFunc       = pEncCtx->pFuncList;
550   SDqLayer* pCurDqLayer         = pEncCtx->pCurDqLayer;
551   int32_t iLambda               = pWelsMd->iLambda;
552   int32_t iBestCostLuma         = pWelsMd->iCostLuma;
553   uint8_t* pEncMb               = pMbCache->SPicData.pEncMb[0];
554   uint8_t* pDecMb               = pMbCache->SPicData.pCsMb[0];
555   const int32_t kiLineSizeEnc   = pCurDqLayer->iEncStride[0];
556   const int32_t kiLineSizeDec   = pCurDqLayer->iCsStride[0];
557 
558   uint8_t* pCurEnc, *pCurDec, *pDst;
559   int8_t iPredMode, iCurMode, iBestMode, iFinalMode;
560   int32_t iCurCost, iBestCost;
561   int32_t iAvailCount;
562   const uint8_t* kpAvailMode;
563   int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
564   int32_t iCostH, iCostV, iCostVR, iCostHD, iCostVL, iCostHU, iBestModeFake;
565   int32_t lambda[2] = {iLambda << 2, iLambda};
566   bool* pPrevIntra4x4PredModeFlag       = pMbCache->pPrevIntra4x4PredModeFlag;
567   int8_t* pRemIntra4x4PredModeFlag      = pMbCache->pRemIntra4x4PredModeFlag;
568   const uint8_t* kpIntra4x4AvailCount   = &g_kiIntra4AvailCount[0];
569   const uint8_t* kpCache48CountScan4    = &g_kuiCache48CountScan4Idx[0];
570   const int8_t* kpNeighborIntraToI4x4   = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
571   const int8_t* kpCoordinateIdxX        = &g_kiCoordinateIdx4x4X[0];
572   const int8_t* kpCoordinateIdxY        = &g_kiCoordinateIdx4x4Y[0];
573   int32_t iBestPredBufferNum            = 0;
574   int32_t iCosti4x4                     = 0;
575 #if defined(X86_ASM)
576   WelsPrefetchZero_mmx (g_kiMapModeI4x4);
577   WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
578 #endif//X86_ASM
579 
580   for (i = 0; i < 16; i++) {
581     const int32_t kiOffset = kpNeighborIntraToI4x4[i];
582 //    const int32_t i_next = (1+i) & 15; // next loop
583 //    const uint8_t dummy_byte= pIntra4x4AvailCount[pNeighborIntraToI4x4[i_next]]; // prefetch pIntra4x4AvailCount of next loop to avoid cache missed
584 
585     //step 1: locating current 4x4 block position in pEnc and pDecMb
586     iCoordinateX = kpCoordinateIdxX[i];
587     iCoordinateY = kpCoordinateIdxY[i];
588 
589     iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
590     pCurEnc = pEncMb + iIdxStrideEnc;
591     iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
592     pCurDec = pDecMb + iIdxStrideDec;
593 
594     //step 2: get predicted mode from neighbor
595     iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
596     //step 3: collect candidates of iPredMode
597     iAvailCount = kpIntra4x4AvailCount[kiOffset];
598     kpAvailMode = g_kiIntra4AvailMode[kiOffset];
599 
600     if (iAvailCount == 9 || iAvailCount == 7) {
601       //I4_PRED_DC(2)
602 
603       iBestMode = I4_PRED_DC;
604 
605       pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
606 
607       pFunc->pfGetLumaI4x4Pred[I4_PRED_DC] (pDst, pCurDec, kiLineSizeDec);
608       iBestCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
609                   lambda[iPredMode == g_kiMapModeI4x4[iBestMode]];
610 
611       //I4_PRED_H(1)
612       iCurMode = I4_PRED_H;
613 
614       pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
615 
616       pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
617       iCostH = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
618                           lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
619 
620       if (iCurCost < iBestCost) {
621         iBestMode = iCurMode;
622         iBestCost = iCurCost;
623         iBestPredBufferNum = 1 - iBestPredBufferNum;
624       }
625 
626       //I4_PRED_V(0)
627       iCurMode = I4_PRED_V;
628 
629       pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
630 
631       pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
632       iCostV = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
633                           lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
634 
635       if (iCurCost < iBestCost) {
636         iBestMode = iCurMode;
637         iBestCost = iCurCost;
638         iBestPredBufferNum = 1 - iBestPredBufferNum;
639       }
640       if (iCostV < iCostH) {
641         if (iAvailCount == 9) {
642           iBestModeFake = true; //indicating whether V is the best fake mode
643 
644           //I4_PRED_VR(5) and I4_PRED_VL(7)
645           iCurMode = I4_PRED_VR;
646 
647           pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
648 
649           pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
650           iCostVR = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
651                                lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
652 
653           if (iCurCost < iBestCost) {
654             iBestMode = iCurMode;
655             iBestCost = iCurCost;
656             iBestPredBufferNum = 1 - iBestPredBufferNum;
657           }
658 
659           if (iCurCost < iCostV)
660             iBestModeFake = false;
661 
662           iCurMode = I4_PRED_VL;
663 
664           pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
665 
666           pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
667           iCostVL = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
668                                lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
669 
670           if (iCurCost < iBestCost) {
671             iBestMode = iCurMode;
672             iBestCost = iCurCost;
673             iBestPredBufferNum = 1 - iBestPredBufferNum;
674           }
675 
676           if (iCurCost < iCostV)
677             iBestModeFake = false;
678 
679           //Vertical Early Determination
680           if (!iBestModeFake) { //Vertical is not the best, go on checking...
681             //select the best one from VL and VR
682             if (iCostVR < iCostVL) {
683               //I4_PRED_DDR(4)
684               iCurMode = I4_PRED_DDR;
685 
686               pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
687 
688               pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
689 
690               iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
691                          lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
692 
693               if (iCurCost < iBestCost) {
694                 iBestMode = iCurMode;
695                 iBestCost = iCurCost;
696                 iBestPredBufferNum = 1 - iBestPredBufferNum;
697               }
698             } else {
699               //I4_PRED_DDL(3)
700               iCurMode = I4_PRED_DDL;
701 
702               pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
703 
704               pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
705 
706               iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
707                          lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
708 
709               if (iCurCost < iBestCost) {
710                 iBestMode = iCurMode;
711                 iBestCost = iCurCost;
712                 iBestPredBufferNum = 1 - iBestPredBufferNum;
713               }
714             }
715           }
716         } else if (iAvailCount == 7) {
717           iCurMode = I4_PRED_DDR;
718 
719           pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
720 
721           pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
722           iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
723                      lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
724 
725           if (iCurCost < iBestCost) {
726             iBestMode = iCurMode;
727             iBestCost = iCurCost;
728             iBestPredBufferNum = 1 - iBestPredBufferNum;
729           }
730 
731           iCurMode = I4_PRED_VR;
732 
733           pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
734 
735           pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
736 
737           iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
738                      lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
739 
740           if (iCurCost < iBestCost) {
741             iBestMode = iCurMode;
742             iBestCost = iCurCost;
743             iBestPredBufferNum = 1 - iBestPredBufferNum;
744           }
745         }
746       } else {
747         iBestModeFake = true; //indicating whether H is the best fake mode
748         //I4_PRED_HD(6) and I4_PRED_HU(8)
749         iCurMode = I4_PRED_HD;
750 
751         pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
752 
753         pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
754         iCostHD = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
755                              lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
756 
757         if (iCurCost < iBestCost) {
758           iBestMode = iCurMode;
759           iBestCost = iCurCost;
760           iBestPredBufferNum = 1 - iBestPredBufferNum;
761         }
762 
763         if (iCurCost < iCostH)
764           iBestModeFake = false;
765 
766         iCurMode = I4_PRED_HU;
767 
768         pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
769 
770         pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
771         iCostHU = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
772                              lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
773 
774         if (iCurCost < iBestCost) {
775           iBestMode = iCurMode;
776           iBestCost = iCurCost;
777           iBestPredBufferNum = 1 - iBestPredBufferNum;
778         }
779 
780         if (iCurCost < iCostH)
781           iBestModeFake = false;
782 
783         if (!iBestModeFake) { //Horizontal is not the best, go on checking...
784           //select the best one from HD and HU
785           if (iCostHD < iCostHU) {
786             //I4_PRED_DDR(4)
787             iCurMode = I4_PRED_DDR;
788 
789             pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
790 
791             pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
792             iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
793                        lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
794 
795             if (iCurCost < iBestCost) {
796               iBestMode = iCurMode;
797               iBestCost = iCurCost;
798               iBestPredBufferNum = 1 - iBestPredBufferNum;
799             }
800           } else if (iAvailCount == 9) {
801             //I4_PRED_DDL(3)
802             iCurMode = I4_PRED_DDL;
803 
804             pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
805             pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
806 
807             iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
808                        lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
809 
810             if (iCurCost < iBestCost) {
811               iBestMode = iCurMode;
812               iBestCost = iCurCost;
813               iBestPredBufferNum = 1 - iBestPredBufferNum;
814             }
815 
816           }
817         }
818       }
819     } else {
820       iBestCost = INT_MAX;
821       iBestMode = I4_PRED_INVALID;
822       for (j = 0; j < iAvailCount; j++) {
823         // I4x4_MODE_CHECK(pAvailMode[j], iCurCost);
824         iCurMode = kpAvailMode[j];
825 
826         pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
827 
828         pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
829         iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
830                    lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
831 
832         if (iCurCost < iBestCost) {
833           iBestMode = iCurMode;
834           iBestCost = iCurCost;
835           iBestPredBufferNum = 1 - iBestPredBufferNum;
836         }
837       }
838     }
839     pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
840     iCosti4x4 += iBestCost;
841     if (iCosti4x4 >= iBestCostLuma) {
842       break;
843     }
844 
845     //step 5: update pred mode and sample avail cache
846     iFinalMode = g_kiMapModeI4x4[iBestMode];
847     if (iPredMode == iFinalMode) {
848       *pPrevIntra4x4PredModeFlag++ = true;
849     } else {
850       *pPrevIntra4x4PredModeFlag++ = false;
851       *pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
852     }
853     pRemIntra4x4PredModeFlag++;
854     // pCurMb->pIntra4x4PredMode[scan4[i]] = iFinalMode;
855     pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
856     //step 6: encoding I_4x4
857     WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
858   }
859   ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
860   pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
861   pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
862   pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
863   iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
864   return iCosti4x4;
865 }
866 
WelsMdIntraChroma(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SMbCache * pMbCache,int32_t iLambda)867 int32_t WelsMdIntraChroma (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
868   const int8_t* kpAvailMode;
869   int32_t iAvailCount = 0;
870   int32_t iChmaIdx = 0;
871   uint8_t* pPredIntraChma[2]    = {pMbCache->pMemPredChroma, pMbCache->pMemPredChroma + 128};
872   uint8_t* pDstChma             = pPredIntraChma[0];
873   uint8_t* pEncCb               = pMbCache->SPicData.pEncMb[1];
874   uint8_t* pEncCr               = pMbCache->SPicData.pEncMb[2];
875   uint8_t* pDecCb               = pMbCache->SPicData.pCsMb[1];//pMbCache->SPicData.pDecMb[1];
876   uint8_t* pDecCr               = pMbCache->SPicData.pCsMb[2];//pMbCache->SPicData.pDecMb[2];
877   const int32_t kiLineSizeEnc   = pCurDqLayer->iEncStride[1];
878   const int32_t kiLineSizeDec   = pCurDqLayer->iCsStride[1];//pMbCache->SPicData.i_stride_dec[1];
879 
880   int32_t i, iCurMode, iCurCost, iBestMode, iBestCost = INT_MAX;
881 
882   int32_t iOffset = pMbCache->uiNeighborIntra & 0x07;
883   iAvailCount = g_kiIntraChromaAvailMode[iOffset][4];
884   kpAvailMode = g_kiIntraChromaAvailMode[iOffset];
885   if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra8x8Combined3) {
886     iBestCost = pFunc->sSampleDealingFuncs.pfIntra8x8Combined3 (pDecCb, kiLineSizeDec, pEncCb, kiLineSizeEnc, &iBestMode,
887                 iLambda, pDstChma, pDecCr, pEncCr);
888     iCurMode = kpAvailMode[3];
889     pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
890     pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
891 
892     iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc) +
893                pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
894                iLambda * 4;
895     if (iCurCost < iBestCost) {
896       iBestMode = iCurMode;
897       iBestCost = iCurCost;
898     } else {
899       pFunc->pfGetChromaPred[iBestMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
900       pFunc->pfGetChromaPred[iBestMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
901     }
902     iBestCost += iLambda;
903     iChmaIdx = 1;
904   } else {
905     iBestMode = kpAvailMode[0];
906     for (i = 0; i < iAvailCount; ++ i) {
907       iCurMode = kpAvailMode[i];
908 
909       assert (iCurMode >= 0 && iCurMode < 7);
910 
911       // pDstCb = &pMbCache->mem_pred_intra_cb[iCurMode<<6];
912       // pDstCr = &pMbCache->mem_pred_intra_cr[iCurMode<<6];
913       pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
914       iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc);
915 
916       pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
917       iCurCost += pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
918                   iLambda * BsSizeUE (g_kiMapModeIntraChroma[iCurMode]);
919       if (iCurCost < iBestCost) {
920         iBestMode = iCurMode;
921         iBestCost = iCurCost;
922         iChmaIdx = iChmaIdx ^ 0x01;
923         pDstChma = pPredIntraChma[iChmaIdx];
924       }
925     }
926   }
927 
928   pMbCache->pBestPredIntraChroma = pPredIntraChma[iChmaIdx ^ 0x01];
929   pMbCache->uiChmaI8x8Mode = iBestMode;
930   return iBestCost;
931 }
WelsMdIntraFinePartition(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)932 int32_t WelsMdIntraFinePartition (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
933   int32_t iCosti4x4 = WelsMdI4x4 (pEncCtx, pWelsMd, pCurMb, pMbCache);
934 
935   if (iCosti4x4 < pWelsMd->iCostLuma) {
936     pCurMb->uiMbType = MB_TYPE_INTRA4x4;
937     pWelsMd->iCostLuma = iCosti4x4;
938   }
939   return pWelsMd->iCostLuma;
940 }
941 
WelsMdIntraFinePartitionVaa(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)942 int32_t WelsMdIntraFinePartitionVaa (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
943 
944   if (MdIntraAnalysisVaaInfo (pEncCtx, pMbCache->SPicData.pEncMb[0])) {
945     int32_t iCosti4x4 = WelsMdI4x4Fast (pEncCtx, pWelsMd, pCurMb, pMbCache);
946 
947     if (iCosti4x4 < pWelsMd->iCostLuma) {
948       pCurMb->uiMbType = MB_TYPE_INTRA4x4;
949       pWelsMd->iCostLuma = iCosti4x4;
950     }
951   }
952 
953   return pWelsMd->iCostLuma;
954 }
955 
WelsMdIntraMb(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)956 void WelsMdIntraMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
957   //initial prediction memory for I_16x16
958   pWelsMd->iCostLuma = WelsMdI16x16 (pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
959   pCurMb->uiMbType = MB_TYPE_INTRA16x16;
960 
961   WelsMdIntraSecondaryModesEnc (pEncCtx, pWelsMd, pCurMb, pMbCache);
962 }
963 
InitMe(const SWelsMD & sWelsMd,const int32_t iBlockSize,uint8_t * pEnc,uint8_t * pRef,SScreenBlockFeatureStorage * pRefFeatureStorage,SWelsME & sWelsMe)964 static inline void InitMe (const SWelsMD& sWelsMd, const int32_t iBlockSize, uint8_t* pEnc, uint8_t* pRef,
965                            SScreenBlockFeatureStorage* pRefFeatureStorage,
966                            SWelsME& sWelsMe) {
967   sWelsMe.iCurMeBlockPixX = sWelsMd.iMbPixX;
968   sWelsMe.iCurMeBlockPixY = sWelsMd.iMbPixY;
969   sWelsMe.uiBlockSize = iBlockSize;
970   sWelsMe.pMvdCost = sWelsMd.pMvdCost;
971 
972   sWelsMe.pEncMb = pEnc;
973   sWelsMe.pRefMb = sWelsMe.pColoRefMb = pRef;
974 
975   sWelsMe.pRefFeatureStorage = pRefFeatureStorage;
976 }
977 
WelsMdP16x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurLayer,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb)978 int32_t WelsMdP16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb) {
979   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
980   SWelsME* pMe16x16 = &pWelsMd->sMe.sMe16x16;
981   uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
982   const int32_t kiMbWidth  = pCurLayer->iMbWidth;  // for assign once
983   const int32_t kiMbHeight = pCurLayer->iMbHeight;
984   InitMe (*pWelsMd, BLOCK_16x16, pMbCache->SPicData.pEncMb[0], pMbCache->SPicData.pRefMb[0],
985           pCurLayer->pRefPic->pScreenBlockFeatureStorage,
986           *pMe16x16);
987   //not putting the line below into InitMe to avoid judging mode in InitMe
988   pMe16x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb;
989 
990   pSlice->uiMvcNum = 0;
991   pSlice->sMvc[pSlice->uiMvcNum++] = pMe16x16->sMvBase;
992   //spatial motion vector predictors
993   if (uiNeighborAvail & LEFT_MB_POS) { //left available
994     pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - 1)->sP16x16Mv;
995   }
996   if (uiNeighborAvail & TOP_MB_POS) { //top available
997     pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - kiMbWidth)->sP16x16Mv;
998   }
999   //temporal motion vector predictors
1000   if (pCurLayer->pRefPic->iPictureType == P_SLICE) {
1001     if (pCurMb->iMbX < kiMbWidth - 1) {
1002       SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + 1];
1003       pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
1004       pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
1005       ++ pSlice->uiMvcNum;
1006     }
1007     if (pCurMb->iMbY < kiMbHeight - 1) {
1008       SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + kiMbWidth];
1009       pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
1010       pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
1011       ++ pSlice->uiMvcNum;
1012     }
1013   }
1014 
1015   PredMv (&pMbCache->sMvComponents, 0, 4, 0, & (pMe16x16->sMvp));
1016   pFunc->pfMotionSearch[0] (pFunc, pCurLayer, pMe16x16, pSlice);
1017 
1018   pCurMb->sP16x16Mv = pMe16x16->sMv;
1019   pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = pMe16x16->sMv;
1020 
1021   return pMe16x16->uiSatdCost;
1022 }
WelsMdP16x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice)1023 int32_t WelsMdP16x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1024   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1025   int32_t iStrideEnc = pCurDqLayer->iEncStride[0];
1026   int32_t iStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
1027   SWelsME* sMe16x8;
1028   int32_t i = 0, iPixelY;
1029   int32_t iCostP16x8 = 0;
1030   do {
1031     sMe16x8 = &pWelsMd->sMe.sMe16x8[i];
1032     iPixelY = (i << 3);
1033     InitMe (*pWelsMd, BLOCK_16x8,
1034             pMbCache->SPicData.pEncMb[0] + (iPixelY * iStrideEnc),
1035             pMbCache->SPicData.pRefMb[0] + (iPixelY * iStrideRef),
1036             pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1037             *sMe16x8);
1038     //not putting the lines below into InitMe to avoid judging mode in InitMe
1039     sMe16x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1040     sMe16x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
1041 
1042     pSlice->sMvc[0] = sMe16x8->sMvBase;
1043     pSlice->uiMvcNum = 1;
1044 
1045     PredInter16x8Mv (pMbCache, i << 3, 0, & (sMe16x8->sMvp));
1046     pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe16x8, pSlice);
1047     UpdateP16x8Motion2Cache (pMbCache, i << 3, pWelsMd->uiRef, & (sMe16x8->sMv));
1048     iCostP16x8 += sMe16x8->uiSatdCost;
1049     ++i;
1050   } while (i < 2);
1051   return iCostP16x8;
1052 }
WelsMdP8x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurLayer,SWelsMD * pWelsMd,SSlice * pSlice)1053 int32_t WelsMdP8x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1054   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1055   SWelsME* sMe8x16;
1056   int32_t i = 0, iPixelX;
1057   int32_t iCostP8x16 = 0;
1058   do {
1059     iPixelX = (i << 3);
1060     sMe8x16 = &pWelsMd->sMe.sMe8x16[i];
1061     InitMe (*pWelsMd, BLOCK_8x16,
1062             pMbCache->SPicData.pEncMb[0] + iPixelX,
1063             pMbCache->SPicData.pRefMb[0] + iPixelX,
1064             pCurLayer->pRefPic->pScreenBlockFeatureStorage,
1065             *sMe8x16);
1066     //not putting the lines below into InitMe to avoid judging mode in InitMe
1067     sMe8x16->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1068     sMe8x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
1069 
1070     pSlice->sMvc[0] = sMe8x16->sMvBase;
1071     pSlice->uiMvcNum = 1;
1072 
1073     PredInter8x16Mv (pMbCache, i << 2, 0, & (sMe8x16->sMvp));
1074     pFunc->pfMotionSearch[0] (pFunc, pCurLayer, sMe8x16, pSlice);
1075     UpdateP8x16Motion2Cache (pMbCache, i << 2, pWelsMd->uiRef, & (sMe8x16->sMv));
1076     iCostP8x16 += sMe8x16->uiSatdCost;
1077     ++i;
1078   } while (i < 2);
1079   return iCostP8x16;
1080 }
WelsMdP8x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice)1081 int32_t WelsMdP8x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1082   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1083   int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1084   int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1085   SWelsME* sMe8x8;
1086   int32_t i, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1087   int32_t iCostP8x8 = 0;
1088   for (i = 0; i < 4; i++) {
1089     iIdxX = i & 1;
1090     iIdxY = i >> 1;
1091     iPixelX = (iIdxX << 3);
1092     iPixelY = (iIdxY << 3);
1093     iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1094     iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1095 
1096     sMe8x8 = &pWelsMd->sMe.sMe8x8[i];
1097     InitMe (*pWelsMd, BLOCK_8x8,
1098             pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1099             pMbCache->SPicData.pRefMb[0] + iStrideRef,
1100             pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1101             *sMe8x8);
1102     //not putting these three lines below into InitMe to avoid judging mode in InitMe
1103     sMe8x8->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1104     sMe8x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1105     sMe8x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1106 
1107 
1108     pSlice->sMvc[0] = sMe8x8->sMvBase;
1109     pSlice->uiMvcNum = 1;
1110 
1111     PredMv (&pMbCache->sMvComponents, i << 2, 2, pWelsMd->uiRef, & (sMe8x8->sMvp));
1112     pFunc->pfMotionSearch[pWelsMd->iBlock8x8StaticIdc[i]] (pFunc, pCurDqLayer, sMe8x8, pSlice);
1113     UpdateP8x8Motion2Cache (pMbCache, i << 2, pWelsMd->uiRef, & (sMe8x8->sMv));
1114     iCostP8x8 += sMe8x8->uiSatdCost;
1115 //    sMe8x8++;
1116   }
1117   return iCostP8x8;
1118 }
1119 
WelsMdP4x4(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1120 int32_t WelsMdP4x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1121                     const int32_t ki8x8Idx) {
1122   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1123   int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1124   int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1125   SWelsME* sMe4x4;
1126   int32_t i4x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1127   int32_t iCostP4x4 = 0;
1128   for (i4x4Idx = 0; i4x4Idx < 4; ++i4x4Idx) {
1129     int32_t iPartIdx = (ki8x8Idx << 2) + i4x4Idx;
1130     iIdxX = ((ki8x8Idx & 1) << 1) + (i4x4Idx & 1);
1131     iIdxY = ((ki8x8Idx >> 1) << 1) + (i4x4Idx >> 1);
1132     iPixelX = (iIdxX << 2);
1133     iPixelY = (iIdxY << 2);
1134     iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1135     iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1136 
1137     sMe4x4 = &pWelsMd->sMe.sMe4x4[ki8x8Idx][i4x4Idx];
1138     InitMe (*pWelsMd, BLOCK_4x4,
1139             pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1140             pMbCache->SPicData.pRefMb[0] + iStrideRef,
1141             pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1142             *sMe4x4);
1143     //not putting these three lines below into InitMe to avoid judging mode in InitMe
1144     sMe4x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1145     sMe4x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1146     sMe4x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1147 
1148     pSlice->sMvc[0] = sMe4x4->sMvBase;
1149     pSlice->uiMvcNum = 1;
1150 
1151     PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x4->sMvp));
1152     pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x4, pSlice);
1153     UpdateP4x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x4->sMv));
1154     iCostP4x4 += sMe4x4->uiSatdCost;
1155   }
1156   return iCostP4x4;
1157 }
1158 
WelsMdP8x4(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1159 int32_t WelsMdP8x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1160                     const int32_t ki8x8Idx) {
1161   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1162   int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1163   int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1164   SWelsME* sMe8x4;
1165   int32_t i8x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1166   int32_t iCostP8x4 = 0;
1167   for (i8x4Idx = 0; i8x4Idx < 2; ++i8x4Idx) {
1168     int32_t iPartIdx = (ki8x8Idx << 2) + (i8x4Idx << 1);
1169     iIdxX = ((ki8x8Idx & 1) << 1);
1170     iIdxY = ((ki8x8Idx >> 1) << 1) + i8x4Idx;
1171     iPixelX = (iIdxX << 2);
1172     iPixelY = (iIdxY << 2);
1173     iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1174     iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1175 
1176     sMe8x4 = &pWelsMd->sMe.sMe8x4[ki8x8Idx][i8x4Idx];
1177     InitMe (*pWelsMd, BLOCK_8x4,
1178             pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1179             pMbCache->SPicData.pRefMb[0] + iStrideRef,
1180             pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1181             *sMe8x4);
1182     //not putting these three lines below into InitMe to avoid judging mode in InitMe
1183     sMe8x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1184     sMe8x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1185     sMe8x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1186 
1187     pSlice->sMvc[0] = sMe8x4->sMvBase;
1188     pSlice->uiMvcNum = 1;
1189 
1190     PredMv (&pMbCache->sMvComponents, iPartIdx, 2, pWelsMd->uiRef, & (sMe8x4->sMvp));
1191     pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe8x4, pSlice);
1192     UpdateP8x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe8x4->sMv));
1193     iCostP8x4 += sMe8x4->uiSatdCost;
1194   }
1195   return iCostP8x4;
1196 }
1197 
WelsMdP4x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1198 int32_t WelsMdP4x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1199                     const int32_t ki8x8Idx) {
1200   //Wayne, to be modified
1201   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1202   int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1203   int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1204   SWelsME* sMe4x8;
1205   int32_t i4x8Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1206   int32_t iCostP4x8 = 0;
1207   for (i4x8Idx = 0; i4x8Idx < 2; ++i4x8Idx) {
1208     int32_t iPartIdx = (ki8x8Idx << 2) + i4x8Idx;
1209     iIdxX = ((ki8x8Idx & 1) << 1) + i4x8Idx;
1210     iIdxY = ((ki8x8Idx >> 1) << 1);
1211     iPixelX = (iIdxX << 2);
1212     iPixelY = (iIdxY << 2);
1213     iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1214     iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1215 
1216     sMe4x8 = &pWelsMd->sMe.sMe4x8[ki8x8Idx][i4x8Idx];
1217     InitMe (*pWelsMd, BLOCK_4x8,
1218             pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1219             pMbCache->SPicData.pRefMb[0] + iStrideRef,
1220             pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1221             *sMe4x8);
1222     //not putting these three lines below into InitMe to avoid judging mode in InitMe
1223     sMe4x8->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1224     sMe4x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1225     sMe4x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1226 
1227     pSlice->sMvc[0] = sMe4x8->sMvBase;
1228     pSlice->uiMvcNum = 1;
1229 
1230     PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x8->sMvp));
1231     pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x8, pSlice);
1232     UpdateP4x8Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x8->sMv));
1233     iCostP4x8 += sMe4x8->uiSatdCost;
1234   }
1235   return iCostP4x8;
1236 }
1237 
WelsMdInterFinePartition(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,int32_t iBestCost)1238 void WelsMdInterFinePartition (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
1239   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1240 //  SMbCache *pMbCache = &pSlice->sMbCacheInfo;
1241   int32_t iCost = 0;
1242 
1243 //  WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1244 
1245   iCost = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1246 
1247   if (iCost < iBestCost) {
1248     int32_t iCostPart;
1249     pCurMb->uiMbType = MB_TYPE_8x8;
1250     memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1251 
1252 //    WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1253     iCostPart = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1254     if (iCostPart <= iCost) {
1255       iCost = iCostPart;
1256       pCurMb->uiMbType = MB_TYPE_16x8;
1257       //pCurMb->mb_partition = 2;
1258     }
1259 
1260 //    WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1261     iCostPart = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1262     if (iCostPart <= iCost) {
1263       iCost = iCostPart;
1264       pCurMb->uiMbType = MB_TYPE_8x16;
1265       //pCurMb->mb_partition = 2;
1266     }
1267   }
1268 }
1269 
WelsMdInterFinePartitionVaa(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,int32_t iBestCost)1270 void WelsMdInterFinePartitionVaa (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
1271                                   int32_t iBestCost) {
1272   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1273 //  SMbCache *pMbCache = &pSlice->sMbCacheInfo;
1274   int32_t iCostP8x16, iCostP16x8, iCostP8x8;
1275   uint8_t uiMbSign = pEncCtx->pFuncList->pfGetMbSignFromInterVaa (&pEncCtx->pVaa->sVaaCalcInfo.pSad8x8[pCurMb->iMbXY][0]);
1276 
1277   if (uiMbSign == 15) {
1278     return;
1279   }
1280 
1281 //  iCost = pWelsMd->sMe16x16.uiSatdCost;
1282 
1283   switch (uiMbSign) {
1284   case 3:
1285   case 12:
1286 //    WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1287     iCostP16x8 = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1288     if (iCostP16x8 < iBestCost) {
1289       iBestCost = iCostP16x8;
1290       pCurMb->uiMbType = MB_TYPE_16x8;
1291       //pCurMb->mb_partition = 2;
1292     }
1293     break;
1294 
1295   case 5:
1296   case 10:
1297 //    WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1298     iCostP8x16 = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1299     if (iCostP8x16 < iBestCost) {
1300       iBestCost = iCostP8x16;
1301       pCurMb->uiMbType = MB_TYPE_8x16;
1302       //pCurMb->mb_partition = 2;
1303     }
1304     break;
1305 
1306   case 6:
1307   case 9:
1308     iCostP8x8 = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1309     if (iCostP8x8 < iBestCost) {
1310       iBestCost = iCostP8x8;
1311       pCurMb->uiMbType = MB_TYPE_8x8;
1312       memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1313     }
1314     break;
1315 
1316   default:
1317     iCostP8x8 = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1318     if (iCostP8x8 < iBestCost) {
1319       iBestCost = iCostP8x8;
1320       pCurMb->uiMbType = MB_TYPE_8x8;
1321       memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1322 
1323       iCostP16x8 = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1324       if (iCostP16x8 <= iBestCost) {
1325         iBestCost = iCostP16x8;
1326         pCurMb->uiMbType = MB_TYPE_16x8;
1327       }
1328 
1329       iCostP8x16 = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1330       if (iCostP8x16 <= iBestCost) {
1331         iBestCost = iCostP8x16;
1332         pCurMb->uiMbType = MB_TYPE_8x16;
1333       }
1334     }
1335     break;
1336   }
1337   pWelsMd->iCostLuma = iBestCost;
1338 }
1339 
1340 
VaaBackgroundMbDataUpdate(SWelsFuncPtrList * pFunc,SVAAFrameInfo * pVaaInfo,SMB * pCurMb)1341 inline void VaaBackgroundMbDataUpdate (SWelsFuncPtrList* pFunc, SVAAFrameInfo* pVaaInfo, SMB* pCurMb) {
1342   const int32_t kiPicStride     = pVaaInfo->iPicStride;
1343   const int32_t kiPicStrideUV   = pVaaInfo->iPicStrideUV;
1344   const int32_t kiOffsetY       = (pCurMb->iMbY * kiPicStride + pCurMb->iMbX) << 4;
1345   const int32_t kiOffsetUV      = (pCurMb->iMbY * kiPicStrideUV + pCurMb->iMbX) << 3;
1346 
1347   pFunc->pfCopy16x16Aligned (pVaaInfo->pCurY + kiOffsetY, kiPicStride, pVaaInfo->pRefY + kiOffsetY, kiPicStride);
1348   pFunc->pfCopy8x8Aligned (pVaaInfo->pCurU + kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefU + kiOffsetUV, kiPicStrideUV);
1349   pFunc->pfCopy8x8Aligned (pVaaInfo->pCurV + kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefV + kiOffsetUV, kiPicStrideUV);
1350 }
1351 
WelsMdBackgroundMbEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache,SSlice * pSlice,bool bSkipMbFlag)1352 void WelsMdBackgroundMbEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache, SSlice* pSlice,
1353                             bool bSkipMbFlag) {
1354   SDqLayer* pCurDqLayer         = pEncCtx->pCurDqLayer;
1355   SWelsFuncPtrList* pFunc       = pEncCtx->pFuncList;
1356   SMVUnitXY sMvp                = { 0 };
1357   uint8_t* pRefLuma             = pMbCache->SPicData.pRefMb[0];
1358   uint8_t* pRefCb               = pMbCache->SPicData.pRefMb[1];
1359   uint8_t* pRefCr               = pMbCache->SPicData.pRefMb[2];
1360   int32_t iLineSizeY            = pCurDqLayer->pRefPic->iLineSize[0];
1361   int32_t iLineSizeUV           = pCurDqLayer->pRefPic->iLineSize[1];
1362   uint8_t* pDstLuma             = pMbCache->pSkipMb;
1363   uint8_t* pDstCb               = pMbCache->pSkipMb + 256;
1364   uint8_t* pDstCr               = pMbCache->pSkipMb + 256 + 64;
1365 
1366   if (!bSkipMbFlag) {
1367     pDstLuma    = pMbCache->pMemPredLuma;
1368     pDstCb      = pMbCache->pMemPredChroma;
1369     pDstCr      = pMbCache->pMemPredChroma + 64;
1370   }
1371   //MC
1372   pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, 0, 0, 16, 16);
1373   pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
1374   pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
1375 
1376   pCurMb->uiCbp = 0;
1377   pMbCache->bCollocatedPredFlag = true;
1378   pWelsMd->iCostLuma = 0;//BGD&RC integration
1379   pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1380                         pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
1381   ST32 (&pCurMb->sP16x16Mv, 0);
1382   ST32 (&pCurDqLayer->pDecPic->sMvList[pCurMb->iMbXY], 0);
1383 
1384   if (bSkipMbFlag) {
1385     pCurMb->uiMbType = MB_TYPE_BACKGROUND;
1386 
1387     //update motion info to current MB
1388     ST32 (pCurMb->pRefIndex, 0);
1389     pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1390 
1391     pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
1392     pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
1393                                                           pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
1394 
1395     WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
1396     VaaBackgroundMbDataUpdate (pEncCtx->pFuncList, pEncCtx->pVaa, pCurMb);
1397     return;
1398   }
1399 
1400   pCurMb->uiMbType = MB_TYPE_16x16;
1401 
1402   pWelsMd->sMe.sMe16x16.sMv.iMvX = 0;
1403   pWelsMd->sMe.sMe16x16.sMv.iMvY = 0;
1404   PredMv (&pMbCache->sMvComponents, 0, 4, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMvp);
1405   pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
1406 
1407   UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
1408 
1409   if (pWelsMd->bMdUsingSad)
1410     pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1411   else
1412     pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1413                          pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
1414 
1415   WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
1416   WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
1417 
1418   pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], pCurDqLayer->iCsStride[0], pMbCache->pMemPredLuma,     16);
1419   pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma,    8);
1420   pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma + 64, 8);
1421 }
1422 
WelsMdPSkipEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1423 bool WelsMdPSkipEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1424   SDqLayer* pCurLayer           = pEncCtx->pCurDqLayer;
1425   SWelsFuncPtrList* pFunc       = pEncCtx->pFuncList;
1426 
1427   uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
1428   uint8_t* pRefCb   = pMbCache->SPicData.pRefMb[1];
1429   uint8_t* pRefCr   = pMbCache->SPicData.pRefMb[2];
1430   int32_t iLineSizeY  = pCurLayer->pRefPic->iLineSize[0];
1431   int32_t iLineSizeUV = pCurLayer->pRefPic->iLineSize[1];
1432 
1433   uint8_t* pDstLuma = pMbCache->pSkipMb;
1434   uint8_t* pDstCb   = pMbCache->pSkipMb + 256;
1435   uint8_t* pDstCr   = pMbCache->pSkipMb + 256 + 64;
1436 
1437   SMVUnitXY sMvp = { 0 };
1438   int32_t n;
1439 
1440   int32_t iEncStride = pCurLayer->iEncStride[0];
1441   uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
1442   int32_t* pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
1443   int32_t* pEncBlockOffset;
1444 
1445   int32_t iSadCostLuma = 0;
1446   int32_t iSadCostChroma = 0;
1447   int32_t iSadCostMb = 0;
1448 
1449   PredSkipMv (pMbCache, &sMvp);
1450 
1451   // Special case, need to clip the vector //
1452   SMVUnitXY sQpelMvp = { static_cast<int16_t> (sMvp.iMvX >> 2), static_cast<int16_t> (sMvp.iMvY >> 2) };
1453   n = (pCurMb->iMbX << 4) + sQpelMvp.iMvX;
1454   if (n < -29)
1455     return false;
1456   else if (n > (int32_t) ((pCurLayer->iMbWidth << 4) + 12))
1457     return false;
1458 
1459   n = (pCurMb->iMbY << 4) + sQpelMvp.iMvY;
1460   if (n < -29)
1461     return false;
1462   else if (n > (int32_t) ((pCurLayer->iMbHeight << 4) + 12))
1463     return false;
1464 
1465   //luma
1466   pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
1467   pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, sMvp.iMvX, sMvp.iMvY, 16, 16);
1468   iSadCostLuma    = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1469                     pCurLayer->iEncStride[0], pDstLuma, 16);
1470 
1471   const int32_t iStrideUV = (sQpelMvp.iMvY >> 1) * iLineSizeUV + (sQpelMvp.iMvX >> 1);
1472   pRefCb += iStrideUV;
1473   pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
1474   iSadCostChroma  = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
1475                     pCurLayer->iEncStride[1], pDstCb, 8);
1476 
1477   pRefCr += iStrideUV;
1478   pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
1479   iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
1480                     pCurLayer->iEncStride[2], pDstCr, 8);
1481 
1482   iSadCostMb = iSadCostLuma + iSadCostChroma;
1483 
1484   if (iSadCostMb == 0                             ||
1485       iSadCostMb < pWelsMd->iSadPredSkip   ||
1486       (pCurLayer->pRefPic->iPictureType == P_SLICE     &&
1487        pMbCache->uiRefMbType == MB_TYPE_SKIP    &&
1488        iSadCostMb < pCurLayer->pRefPic->pMbSkipSad[pCurMb->iMbXY])) {
1489     //update motion info to current MB
1490     ST32 (pCurMb->pRefIndex, 0);
1491     pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1492 
1493     if (pWelsMd->bMdUsingSad) {
1494       pCurMb->pSadCost[0] = iSadCostLuma;
1495       pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1496     } else
1497       pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1498                            pCurLayer->iEncStride[0], pDstLuma, 16);
1499 
1500     pWelsMd->iCostSkipMb = iSadCostMb;
1501 
1502     pCurMb->sP16x16Mv = sMvp;
1503     pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
1504 
1505     return true;
1506   }
1507 
1508   WelsDctMb (pMbCache->pCoeffLevel,  pEncMb, iEncStride, pDstLuma, pEncCtx->pFuncList->pfDctFourT4);
1509 
1510   if (WelsTryPYskip (pEncCtx, pCurMb, pMbCache)) {
1511     iEncStride = pEncCtx->pCurDqLayer->iEncStride[1];
1512     pEncMb = pMbCache->SPicData.pEncMb[1];
1513     pEncBlockOffset = pStrideEncBlockOffset + 16;
1514     pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 256, & (pEncMb[*pEncBlockOffset]), iEncStride, pMbCache->pSkipMb + 256, 8);
1515     if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 1)) {
1516       pEncMb = pMbCache->SPicData.pEncMb[2];
1517       pEncBlockOffset = pStrideEncBlockOffset + 20;
1518       pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 320, & (pEncMb[*pEncBlockOffset]), iEncStride, pMbCache->pSkipMb + 320, 8);
1519       if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 2)) {
1520         //update motion info to current MB
1521         ST32 (pCurMb->pRefIndex, 0);
1522         pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1523 
1524         if (pWelsMd->bMdUsingSad) {
1525           pCurMb->pSadCost[0] = iSadCostLuma;
1526           pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1527         } else
1528           pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1529                                pCurLayer->iEncStride[0], pDstLuma, 16);
1530 
1531         pWelsMd->iCostSkipMb = iSadCostMb;
1532 
1533         pCurMb->sP16x16Mv = sMvp;
1534         pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
1535 
1536         return true;
1537       }
1538     }
1539   }
1540   return false;
1541 }
1542 
1543 const int32_t g_kiPixStrideIdx8x8[4] = {  0,                                             ME_REFINE_BUF_WIDTH_BLK8,
1544                                           ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8
1545                                        };
1546 const int32_t g_kiPixStrideIdx4x4[4][4] = {
1547   {
1548     0,
1549     0 + ME_REFINE_BUF_WIDTH_BLK4,
1550     0 + ME_REFINE_BUF_STRIDE_BLK4,
1551     0 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1552   }, //[0][]
1553   {
1554     ME_REFINE_BUF_WIDTH_BLK8,
1555     ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1556     ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1557     ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1558   }, //[1][]
1559   {
1560     ME_REFINE_BUF_STRIDE_BLK8,
1561     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1562     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1563     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1564   }, //[2][]
1565   {
1566     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8,
1567     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1568     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1569     ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1570   } //[3][]
1571 };
1572 
WelsMdInterMbRefinement(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1573 void WelsMdInterMbRefinement (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1574   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1575   SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1576   uint8_t* pTmpRefCb, *pTmpRefCr, *pTmpDstCb, *pTmpDstCr;
1577   int32_t iMvStride, iRefBlk4Stride, iDstBlk4Stride;
1578   SMVUnitXY* pMv;
1579   int32_t iBestSadCost = 0, iBestSatdCost = 0;
1580   SMeRefinePointer sMeRefine;
1581 
1582   int32_t i, j, iIdx, iPixStride;
1583 
1584   uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
1585   uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
1586   uint8_t* pDstCb = pMbCache->pMemPredChroma;
1587   uint8_t* pDstCr = pMbCache->pMemPredChroma + 64;
1588   uint8_t* pDstLuma = pMbCache->pMemPredLuma;
1589 
1590   int32_t iLineSizeRefUV = pCurDqLayer->pRefPic->iLineSize[1];
1591 
1592   switch (pCurMb->uiMbType) {
1593   case MB_TYPE_16x16:
1594     //luma
1595     InitMeRefinePointer (&sMeRefine, pMbCache, 0);
1596     sMeRefine.pfCopyBlockByMode =
1597       pFunc->pfCopy16x16NotAligned; // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
1598     MeRefineFracPixel (pEncCtx, pDstLuma, &pWelsMd->sMe.sMe16x16, &sMeRefine, 16, 16);
1599     UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
1600 
1601     pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
1602     //save the best cost of final mode
1603     iBestSadCost  = pWelsMd->sMe.sMe16x16.uiSadCost;
1604     iBestSatdCost = pWelsMd->sMe.sMe16x16.uiSatdCost;
1605 
1606     //chroma
1607     pMv = &pWelsMd->sMe.sMe16x16.sMv;
1608     iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1609     pTmpRefCb = pRefCb + iMvStride;
1610     pTmpRefCr = pRefCr + iMvStride;
1611     pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cb
1612     pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cr
1613 
1614     pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1615                            pCurDqLayer->iEncStride[0], pDstLuma, 16);
1616     pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
1617                             pCurDqLayer->iEncStride[1], pDstCb, 8);
1618     pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
1619                             pCurDqLayer->iEncStride[2], pDstCr, 8);
1620     break;
1621 
1622   case MB_TYPE_16x8:
1623     iPixStride = 0;
1624     sMeRefine.pfCopyBlockByMode =
1625       pFunc->pfCopy16x8NotAligned; // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
1626     for (i = 0; i < 2; i++) {
1627       //luma
1628       iIdx = i << 3;
1629       InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
1630       iPixStride += ME_REFINE_BUF_STRIDE_BLK8;
1631       PredInter16x8Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMvp);
1632       MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe16x8[i], &sMeRefine, 16, 8);
1633       UpdateP16x8MotionInfo (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMv);
1634       pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe16x8[i].sMvp;
1635       //save the best cost of final mode
1636       iBestSadCost += pWelsMd->sMe.sMe16x8[i].uiSadCost;
1637       iBestSatdCost += pWelsMd->sMe.sMe16x8[i].uiSatdCost;
1638 
1639       //chroma
1640       iRefBlk4Stride = (i << 2) * iLineSizeRefUV;
1641       iDstBlk4Stride = i << 5; // 4*8
1642       pMv = &pWelsMd->sMe.sMe16x8[i].sMv;
1643       iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1644       pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
1645       pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
1646       pTmpDstCb = pDstCb + iDstBlk4Stride;
1647       pTmpDstCr = pDstCr + iDstBlk4Stride;
1648       pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cb
1649       pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cr
1650     }
1651     break;
1652 
1653   case MB_TYPE_8x16:
1654     iPixStride = 0;
1655     sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x16Aligned;
1656     for (i = 0; i < 2; i++) {
1657       //luma
1658       iIdx = i << 2;
1659       InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
1660       iPixStride += ME_REFINE_BUF_WIDTH_BLK8;
1661       PredInter8x16Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMvp);
1662       MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe8x16[i], &sMeRefine, 8, 16);
1663       update_P8x16_motion_info (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMv);
1664       pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x16[i].sMvp;
1665       //save the best cost of final mode
1666       iBestSadCost += pWelsMd->sMe.sMe8x16[i].uiSadCost;
1667       iBestSatdCost += pWelsMd->sMe.sMe8x16[i].uiSatdCost;
1668 
1669       //chroma
1670       iRefBlk4Stride = iIdx; //4
1671       pMv = &pWelsMd->sMe.sMe8x16[i].sMv;
1672       iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1673       pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
1674       pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
1675       pTmpDstCb = pDstCb + iRefBlk4Stride;
1676       pTmpDstCr = pDstCr + iRefBlk4Stride;
1677       pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cb
1678       pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cr
1679     }
1680     break;
1681   case MB_TYPE_8x8:
1682     pMbCache->sMvComponents.iRefIndexCache [9] = pMbCache->sMvComponents.iRefIndexCache [21] = REF_NOT_AVAIL;
1683     for (i = 0; i < 4; i++) {
1684       int32_t iBlk8Idx = i << 2; //0, 4, 8, 12
1685       int32_t iBlk4X, iBlk4Y, iBlk4x4Idx;
1686 
1687       pCurMb->pRefIndex[i] = pWelsMd->uiRef;
1688       switch (pCurMb->uiSubMbType[i]) {
1689       case SUB_MB_TYPE_8x8:
1690         sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x8Aligned;
1691         //luma
1692         InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
1693         PredMv (&pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp);
1694         MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);
1695         UpdateP8x8MotionInfo (pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
1696         pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk8Idx]] = pWelsMd->sMe.sMe8x8[i].sMvp;
1697         iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
1698         iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost;
1699 
1700         //chroma
1701         pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
1702         iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1703 
1704         iBlk4X = (i & 1) << 2;
1705         iBlk4Y = (i >> 1) << 2;
1706         iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1707         iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1708 
1709         pTmpRefCb = pRefCb + iRefBlk4Stride;
1710         pTmpDstCb = pDstCb + iDstBlk4Stride;
1711         pTmpRefCr = pRefCr + iRefBlk4Stride;
1712         pTmpDstCr = pDstCr + iDstBlk4Stride;
1713         pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1714             4, 4); //Cb
1715         pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1716             4, 4); //Cr
1717         break;
1718       case SUB_MB_TYPE_4x4:
1719         sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x4;
1720         //luma
1721         for (j = 0; j < 4; ++j) {
1722           iBlk4x4Idx = iBlk8Idx + j;
1723           InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
1724           PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMvp);
1725           MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x4[i][j], &sMeRefine, 4, 4);
1726           UpdateP4x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMv);
1727           pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk4x4Idx]] = pWelsMd->sMe.sMe4x4[i][j].sMvp;
1728           iBestSadCost += pWelsMd->sMe.sMe4x4[i][j].uiSadCost;
1729           iBestSatdCost += pWelsMd->sMe.sMe4x4[i][j].uiSatdCost;
1730 
1731           //chroma
1732           pMv = &pWelsMd->sMe.sMe4x4[i][j].sMv;
1733           iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1734 
1735           iBlk4X = (((i & 1) << 1) + (j & 1)) << 1;
1736           iBlk4Y = (((i >> 1) << 1) + (j >> 1)) << 1;
1737           iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1738           iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1739 
1740           pTmpRefCb = pRefCb + iRefBlk4Stride;
1741           pTmpDstCb = pDstCb + iDstBlk4Stride;
1742           pTmpRefCr = pRefCr + iRefBlk4Stride;
1743           pTmpDstCr = pDstCr + iDstBlk4Stride;
1744           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1745               2, 2); //Cb
1746           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1747               2, 2); //Cr
1748         }
1749         break;
1750       case SUB_MB_TYPE_8x4:
1751         sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x4;
1752         //luma
1753         for (j = 0; j < 2; ++j) {
1754           iBlk4x4Idx = iBlk8Idx + (j << 1);
1755           InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j << 1]);
1756           PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMvp);
1757           MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe8x4[i][j], &sMeRefine, 8, 4);
1758           UpdateP8x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMv);
1759           pMbCache->sMbMvp[g_kuiMbCountScan4Idx[    iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1760           //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[1 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1761           iBestSadCost += pWelsMd->sMe.sMe8x4[i][j].uiSadCost;
1762           iBestSatdCost += pWelsMd->sMe.sMe8x4[i][j].uiSatdCost;
1763 
1764           //chroma
1765           pMv = &pWelsMd->sMe.sMe8x4[i][j].sMv;
1766           iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1767 
1768           iBlk4X = ((i & 1) << 1) << 1;
1769           iBlk4Y = (((i >> 1) << 1) + j) << 1;
1770           iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1771           iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1772 
1773           pTmpRefCb = pRefCb + iRefBlk4Stride;
1774           pTmpDstCb = pDstCb + iDstBlk4Stride;
1775           pTmpRefCr = pRefCr + iRefBlk4Stride;
1776           pTmpDstCr = pDstCr + iDstBlk4Stride;
1777           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1778               4, 2); //Cb
1779           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1780               4, 2); //Cr
1781         }
1782         break;
1783       case SUB_MB_TYPE_4x8:
1784         sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x8;
1785         //luma
1786         for (j = 0; j < 2; ++j) {
1787           iBlk4x4Idx = iBlk8Idx + j;
1788           InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
1789           PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMvp);
1790           MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x8[i][j], &sMeRefine, 4, 8);
1791           UpdateP4x8MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMv);
1792           pMbCache->sMbMvp[g_kuiMbCountScan4Idx[    iBlk4x4Idx]] = pWelsMd->sMe.sMe4x8[i][j].sMvp;
1793           //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[4 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1794           iBestSadCost += pWelsMd->sMe.sMe4x8[i][j].uiSadCost;
1795           iBestSatdCost += pWelsMd->sMe.sMe4x8[i][j].uiSatdCost;
1796 
1797           //chroma
1798           pMv = &pWelsMd->sMe.sMe4x8[i][j].sMv;
1799           iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1800 
1801           iBlk4X = (((i & 1) << 1) + j) << 1;
1802           iBlk4Y = (((i >> 1) << 1)) << 1;
1803           iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1804           iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1805 
1806           pTmpRefCb = pRefCb + iRefBlk4Stride;
1807           pTmpDstCb = pDstCb + iDstBlk4Stride;
1808           pTmpRefCr = pRefCr + iRefBlk4Stride;
1809           pTmpDstCr = pDstCr + iDstBlk4Stride;
1810           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1811               2, 4); //Cb
1812           pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1813               2, 4); //Cr
1814         }
1815         break;
1816       }
1817     }
1818     break;
1819   default:
1820     break;
1821   }
1822   pCurMb->pSadCost[0] = iBestSadCost;
1823   if (pWelsMd->bMdUsingSad)
1824     pWelsMd->iCostLuma = iBestSadCost;
1825   else
1826     pWelsMd->iCostLuma = iBestSatdCost;
1827 
1828 }
WelsMdFirstIntraMode(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1829 bool WelsMdFirstIntraMode (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1830   SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1831 
1832   int32_t iCostI16x16 = WelsMdI16x16 (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
1833 
1834   //compare cost_p16x16 with cost_i16x16
1835   if (iCostI16x16 < pWelsMd->iCostLuma) {
1836     pCurMb->uiMbType = MB_TYPE_INTRA16x16;
1837     pWelsMd->iCostLuma = iCostI16x16;
1838 
1839     pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache);
1840 
1841     //add pEnc&rec to MD--2010.3.15
1842     if (IS_INTRA16x16 (pCurMb->uiMbType)) {
1843       pCurMb->uiCbp = 0;
1844       WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
1845     }
1846 
1847     //chroma
1848     pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
1849     WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache);  //add pEnc&rec to MD--2010.3.15
1850     pCurMb->uiChromPredMode = pMbCache->uiChmaI8x8Mode;
1851     pCurMb->pSadCost[0] = 0;
1852     return true; //intra_mb_type is best
1853   }
1854 
1855   return false;
1856 }
1857 
WelsMdInterMb(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pUnused)1858 void WelsMdInterMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pUnused) {
1859   SDqLayer* pCurDqLayer             = pEncCtx->pCurDqLayer;
1860   SMbCache* pMbCache                = &pSlice->sMbCacheInfo;
1861   const uint32_t kuiNeighborAvail   = pCurMb->uiNeighborAvail;
1862   const int32_t kiMbWidth           = pCurDqLayer->iMbWidth;
1863   const  SMB* top_mb                = pCurMb - kiMbWidth;
1864   const bool bMbLeftAvailPskip      = ((kuiNeighborAvail & LEFT_MB_POS) ? IS_SKIP ((pCurMb - 1)->uiMbType) : false);
1865   const bool bMbTopAvailPskip       = ((kuiNeighborAvail & TOP_MB_POS) ? IS_SKIP (top_mb->uiMbType) : false);
1866   const bool bMbTopLeftAvailPskip   = ((kuiNeighborAvail & TOPLEFT_MB_POS) ? IS_SKIP ((top_mb - 1)->uiMbType) : false);
1867   const bool bMbTopRightAvailPskip = ((kuiNeighborAvail & TOPRIGHT_MB_POS) ? IS_SKIP ((top_mb + 1)->uiMbType) : false);
1868   bool bTrySkip = bMbLeftAvailPskip || bMbTopAvailPskip || bMbTopLeftAvailPskip || bMbTopRightAvailPskip;
1869   bool bKeepSkip = bMbLeftAvailPskip && bMbTopAvailPskip && bMbTopRightAvailPskip;
1870   bool bSkip = false;
1871 
1872   //try BGD skip
1873   if (pEncCtx->pFuncList->pfInterMdBackgroundDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip)) {
1874     return;
1875   }
1876 
1877   //try static or scrolled Pskip
1878   if (pEncCtx->pFuncList->pfSCDPSkipDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache)) {
1879     return;
1880   }
1881 
1882   //step 1: try SKIP
1883   bSkip = WelsMdInterJudgePskip (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip);
1884 
1885   if (bSkip) {
1886     if (bKeepSkip) {
1887       WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
1888       return;
1889     }
1890   } else {
1891     PredictSad (pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb);
1892 
1893     //step 2: P_16x16
1894     pWelsMd->iCostLuma = WelsMdP16x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
1895     pCurMb->uiMbType = MB_TYPE_16x16;
1896   }
1897 
1898   WelsMdInterSecondaryModesEnc (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip);
1899 }
1900 
1901 
1902 
1903 //////
1904 //  try the ordinary Pskip
1905 //////
WelsMdInterJudgePskip(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache,bool bTrySkip)1906 bool WelsMdInterJudgePskip (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
1907                             bool bTrySkip) {
1908   bool bRet = true;
1909   if (((pEncCtx->pRefPic->iPictureType == P_SLICE) && (pMbCache->uiRefMbType == MB_TYPE_SKIP
1910        || pMbCache->uiRefMbType == MB_TYPE_BACKGROUND)) ||
1911       bTrySkip) {
1912     PredictSadSkip (pMbCache->sMvComponents.iRefIndexCache, pMbCache->bMbTypeSkip, pMbCache->iSadCostSkip, 0,
1913                     & (pWelsMd->iSadPredSkip));
1914     bRet = WelsMdPSkipEnc (pEncCtx, pWelsMd, pCurMb, pMbCache) ? true : false;
1915     return bRet;
1916   }
1917 
1918   return false;
1919 }
1920 
1921 //////
1922 //  try the ordinary Pskip
1923 //////
WelsMdInterUpdatePskip(SDqLayer * pCurDqLayer,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1924 void WelsMdInterUpdatePskip (SDqLayer* pCurDqLayer, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1925   //add pEnc&rec to MD--2010.3.15
1926   pCurMb->uiCbp = 0;
1927   pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
1928   pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
1929                                                         pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
1930   pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
1931 }
1932 
1933 
1934 //////
1935 //  doublecheck if current MBTYPE is Pskip
1936 //////
WelsMdInterDoubleCheckPskip(SMB * pCurMb,SMbCache * pMbCache)1937 void WelsMdInterDoubleCheckPskip (SMB* pCurMb, SMbCache* pMbCache) {
1938   if (MB_TYPE_16x16 == pCurMb->uiMbType && 0 == pCurMb->uiCbp) {
1939     if (0 == pCurMb->pRefIndex[0]) {
1940       SMVUnitXY sMvp = { 0 };
1941 
1942       PredSkipMv (pMbCache, &sMvp);
1943       if (LD32 (&sMvp) == LD32 (&pCurMb->sMv[0])) {
1944         pCurMb->uiMbType = MB_TYPE_SKIP;
1945       }
1946     }
1947     pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
1948   }
1949 }
1950 
1951 //////
1952 //  Pskip mb encode
1953 //////
WelsMdInterDecidedPskip(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1954 void WelsMdInterDecidedPskip (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1955   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1956   pCurMb->uiMbType = MB_TYPE_SKIP;
1957   WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
1958   WelsMdInterUpdatePskip (pCurDqLayer, pSlice, pCurMb, pMbCache);
1959 }
1960 
1961 //////
1962 //  inter mb encode
1963 //////
WelsMdInterEncode(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1964 void WelsMdInterEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1965   SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1966   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1967 
1968   //add pEnc&rec to MD--2010.3.15
1969   const int32_t kiCsStrideY = pCurDqLayer->iCsStride[0];
1970   const int32_t kiCsStrideUV = pCurDqLayer->iCsStride[1];
1971 
1972   //add pEnc&rec to MD--2010.3.15
1973   pCurMb->uiCbp = 0;
1974   WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
1975   WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
1976 
1977   pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], kiCsStrideY, pMbCache->pMemPredLuma,      16);
1978   pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], kiCsStrideUV, pMbCache->pMemPredChroma,    8);
1979   pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], kiCsStrideUV, pMbCache->pMemPredChroma + 64, 8);
1980 }
1981 
1982 
1983 
1984 //
1985 //
1986 //
WelsMdInterSaveSadAndRefMbType(Mb_Type * pRefMbtypeList,SMbCache * pMbCache,const SMB * pCurMb,const SWelsMD * pMd)1987 void WelsMdInterSaveSadAndRefMbType (Mb_Type* pRefMbtypeList, SMbCache* pMbCache, const SMB*  pCurMb,
1988                                      const SWelsMD* pMd) {
1989   const Mb_Type kmtCurMbtype = pCurMb->uiMbType;
1990 
1991   //sad
1992   pMbCache->pEncSad[0] = (kmtCurMbtype == MB_TYPE_SKIP) ? pMd->iCostSkipMb : 0;
1993   //uiMbType
1994   pRefMbtypeList[pCurMb->iMbXY] = kmtCurMbtype;
1995 }
1996 
WelsMdInterSecondaryModesEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache,const bool bSkip)1997 void WelsMdInterSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
1998                                    SMbCache* pMbCache, const bool bSkip) {
1999   //step 2: Intra
2000   const bool kbTrySkip = pEncCtx->pFuncList->pfFirstIntraMode (pEncCtx, pWelsMd, pCurMb, pMbCache);
2001   if (kbTrySkip)
2002     return;
2003 
2004   if (bSkip) {
2005     WelsMdInterDecidedPskip (pEncCtx,  pSlice,  pCurMb, pMbCache);
2006   } else {
2007     //Step 3: SubP16 MD
2008     pEncCtx->pFuncList->pfSetScrollingMv (pEncCtx->pVaa, pWelsMd); //SCC
2009     pEncCtx->pFuncList->pfInterFineMd (pEncCtx, pWelsMd, pSlice, pCurMb, pWelsMd->iCostLuma);
2010 
2011     //refinement for inter type
2012     WelsMdInterMbRefinement (pEncCtx, pWelsMd, pCurMb, pMbCache);
2013 
2014     //step 7: invoke encoding
2015     WelsMdInterEncode (pEncCtx, pSlice, pCurMb, pMbCache);
2016 
2017     //step 8: double check Pskip
2018     WelsMdInterDoubleCheckPskip (pCurMb, pMbCache);
2019   }
2020 }
2021 
2022 
WelsMdIntraSecondaryModesEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)2023 void WelsMdIntraSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
2024   SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
2025   //initial prediction memory for I_4x4
2026   pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache); //WelsMdIntraFinePartitionVaa
2027 
2028   //add pEnc&rec to MD--2010.3.15
2029   if (IS_INTRA16x16 (pCurMb->uiMbType)) {
2030     pCurMb->uiCbp = 0;
2031     WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
2032   }
2033 
2034   //chroma
2035   pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
2036   WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache);  //add pEnc&rec to MD--2010.3.15
2037   pCurMb->uiChromPredMode = pMbCache->uiChmaI8x8Mode;
2038   pCurMb->pSadCost[0] = 0;
2039 }
2040 
2041 } // namespace WelsEnc
2042