1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file svc_base_layer_md.c
33 *
34 * \brief mode decision
35 *
36 * \date 2009.08.10 Created
37 *
38 *************************************************************************************
39 */
40 #include "ls_defines.h"
41 #include "mv_pred.h"
42 #include "svc_enc_golomb.h"
43 #include "svc_base_layer_md.h"
44 #include "encoder.h"
45 #include "svc_encode_mb.h"
46 #include "svc_encode_slice.h"
47 namespace WelsEnc {
// Candidate Intra16x16 luma prediction modes for each neighbour-availability pattern.
// WelsMdI16x16 selects the row with (pMbCache->uiNeighborIntra & 0x07).
// Columns 0..3: candidate modes, unused slots are filled with I16_PRED_INVALID.
// Column 4:     number of valid candidates at the head of the row.
// NOTE(review): judging from the mode names, bit0 = left, bit1 = top and bit2 = top-left
// availability -- confirm against the code that fills uiNeighborIntra.
static const ALIGNED_DECLARE (int8_t, g_kiIntra16AvaliMode[8][5], 16) = {
  { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
  { I16_PRED_DC_L, I16_PRED_H, I16_PRED_INVALID, I16_PRED_INVALID, 2 },
  { I16_PRED_DC_T, I16_PRED_V, I16_PRED_INVALID, I16_PRED_INVALID, 2 },
  { I16_PRED_V, I16_PRED_H, I16_PRED_DC, I16_PRED_INVALID, 3 },
  { I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
  { I16_PRED_DC_L, I16_PRED_H, I16_PRED_INVALID, I16_PRED_INVALID, 2 },
  { I16_PRED_DC_T, I16_PRED_V, I16_PRED_INVALID, I16_PRED_INVALID, 2 },
  { I16_PRED_V, I16_PRED_H, I16_PRED_DC, I16_PRED_P, 4 }  // the only row that offers I16_PRED_P
};
58
// Number of valid candidate modes at the head of each g_kiIntra4AvailMode row.
// Both tables are indexed with the same availability pattern (see WelsMdI4x4).
// With I4_PRED_MODE_EXTEND defined, the "top only" patterns (indices 2 and 6) offer
// four candidates instead of two, matching the alternative rows of g_kiIntra4AvailMode.
static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailCount[16], 16) = {
#ifndef I4_PRED_MODE_EXTEND
  1, 3, 2, 4, 1, 3, 2, 7, 1, 3, 4, 6, 1, 3, 4, 9
#else
  1, 3, 4, 4, 1, 3, 4, 7, 1, 3, 4, 6, 1, 3, 4, 9
#endif //I4_PRED_MODE_EXTEND
};
66
// Candidate Intra4x4 prediction modes for each neighbour-availability pattern.
// Row index bits: left_avail | (top_avail<<1) | (left_top_avail<<2) | (right_top_avail<<3);
// the 4-digit comment after each row shows that index in binary (right_top .. left).
// Only the first g_kiIntra4AvailCount[row] entries of a row are meaningful; the remaining
// slots are padded with I4_PRED_INVALID.
static const ALIGNED_DECLARE (uint8_t, g_kiIntra4AvailMode[16][16], 16) = {
  {
    I4_PRED_DC_128, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0000: nothing available

  {
    I4_PRED_DC_L, I4_PRED_H, I4_PRED_HU, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0001: left

  // Top only: the extended build adds two "_TOP" variants of DDL and VL.
#ifndef I4_PRED_MODE_EXTEND
  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0010
#else
  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0010
#endif //I4_PRED_MODE_EXTEND

  {
    I4_PRED_DC, I4_PRED_H, I4_PRED_V, I4_PRED_HU,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0011: left + top

  {
    I4_PRED_DC_128, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0100: left_top

  {
    I4_PRED_DC_L, I4_PRED_H, I4_PRED_HU, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0101: left + left_top

  // Top + left_top: same candidates as the "top only" pattern.
#ifndef I4_PRED_MODE_EXTEND
  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0110
#else
  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_DDL_TOP, I4_PRED_VL_TOP,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0110
#endif //I4_PRED_MODE_EXTEND

  {
    I4_PRED_DC, I4_PRED_H, I4_PRED_V, I4_PRED_HU,
    I4_PRED_DDR, I4_PRED_VR, I4_PRED_HD, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 0111: left + top + left_top

  {
    I4_PRED_DC_128, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1000: right_top

  {
    I4_PRED_DC_L, I4_PRED_H, I4_PRED_HU, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1001: left + right_top

  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_DDL, I4_PRED_VL,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1010: top + right_top

  {
    I4_PRED_DC, I4_PRED_H, I4_PRED_V, I4_PRED_HU,
    I4_PRED_DDL, I4_PRED_VL, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1011: left + top + right_top

  {
    I4_PRED_DC_128, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1100: left_top + right_top

  {
    I4_PRED_DC_L, I4_PRED_H, I4_PRED_HU, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1101: left + left_top + right_top

  {
    I4_PRED_DC_T, I4_PRED_V, I4_PRED_DDL, I4_PRED_VL,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  }, // 1110: top + left_top + right_top

  {
    I4_PRED_DC, I4_PRED_H, I4_PRED_V, I4_PRED_HU,
    I4_PRED_DDL, I4_PRED_VL, I4_PRED_DDR, I4_PRED_VR,
    I4_PRED_HD, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
    I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID
  } // 1111: everything available, all nine modes are candidates

};
// Candidate chroma intra prediction modes for each neighbour-availability pattern.
// Columns 0..3: candidate modes, unused slots are filled with C_PRED_INVALID.
// Column 4:     number of valid candidates at the head of the row.
// NOTE(review): the row layout mirrors g_kiIntra16AvaliMode, so the row index is presumably
// (uiNeighborIntra & 0x07) as well -- the user of this table is not in the visible part of the file.
static const ALIGNED_DECLARE (int8_t, g_kiIntraChromaAvailMode[8][5], 16) = {
  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
  { C_PRED_DC_L, C_PRED_H, C_PRED_INVALID, C_PRED_INVALID, 2 },
  { C_PRED_DC_T, C_PRED_V, C_PRED_INVALID, C_PRED_INVALID, 2 },
  { C_PRED_V, C_PRED_H, C_PRED_DC, C_PRED_INVALID, 3 },
  { C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
  { C_PRED_DC_L, C_PRED_H, C_PRED_INVALID, C_PRED_INVALID, 2 },
  { C_PRED_DC_T, C_PRED_V, C_PRED_INVALID, C_PRED_INVALID, 2 },
  { C_PRED_V, C_PRED_H, C_PRED_DC, C_PRED_P, 4 }  // the only row that offers C_PRED_P
};
210
// Horizontal pixel offset of each of the sixteen 4x4 luma blocks inside its macroblock,
// indexed by the block number used in the I4x4 mode-decision loops.
// Defined right next to the Y table for cache friendliness.
// NOTE(review): the original remark said the two tables total 64 bytes; as two
// int8_t[16] arrays they occupy 32 bytes -- confirm whether the remark is stale.
const int8_t g_kiCoordinateIdx4x4X[16] = { 0, 4, 0, 4,
                                           8, 12, 8, 12,
                                           0, 4, 0, 4,
                                           8, 12, 8, 12
                                         };
217
// Vertical pixel offset (in rows) of each of the sixteen 4x4 luma blocks inside its
// macroblock, indexed by the block number used in the I4x4 mode-decision loops;
// multiplied by the plane stride there to locate the block.
const int8_t g_kiCoordinateIdx4x4Y[16] = { 0, 0, 4, 4,
                                           0, 0, 4, 4,
                                           8, 8, 12, 12,
                                           8, 8, 12, 12
                                         };
// Translates macroblock-level neighbour availability into per-4x4-block availability.
// First index:  pMbCache->uiNeighborIntra of the current macroblock.
// Second index: 4x4 block number (0..15) as iterated in WelsMdI4x4 / WelsMdI4x4Fast.
// Value:        availability pattern used to index g_kiIntra4AvailCount and g_kiIntra4AvailMode.
// Only the entries of blocks touching the macroblock border vary between rows; the remaining
// columns are identical in every row.
static const ALIGNED_DECLARE (int8_t, g_kiNeighborIntraToI4x4[16][16], 16) = {
  { 0, 1, 10, 7, 1, 1, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 1, 1, 15, 7, 1, 1, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 10, 15, 10, 7, 15, 7, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 11, 15, 15, 7, 15, 7, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 4, 1, 10, 7, 1, 1, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 5, 1, 15, 7, 1, 1, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 14, 15, 10, 7, 15, 7, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 15, 15, 15, 7, 15, 7, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 0, 1, 10, 7, 1, 9, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 1, 1, 15, 7, 1, 9, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 10, 15, 10, 7, 15, 15, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 11, 15, 15, 7, 15, 15, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 4, 1, 10, 7, 1, 9, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 5, 1, 15, 7, 1, 9, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
  { 14, 15, 10, 7, 15, 15, 15, 7, 10, 15, 10, 7, 15, 7, 15, 7},
  { 15, 15, 15, 7, 15, 15, 15, 7, 15, 15, 15, 7, 15, 7, 15, 7},
};
241
// Maps an encoder-internal I4x4 mode index (0..13, the range asserted in WelsMdI4x4) to a
// value in 0..8. Indices 0..8 map to themselves; indices 9..11 map to 2 and indices 12 and 13
// map to 3 and 7. The mapped value is what gets compared with the predicted mode and stored
// in the intra-pred-mode cache.
// NOTE(review): indices 9..13 are presumably the DC_L/DC_T/DC_128/DDL_TOP/VL_TOP variants
// folding back onto DC, DDL and VL -- confirm against the I4_PRED_* enum.
ALIGNED_DECLARE (const int8_t, g_kiMapModeI4x4[14], 16) = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 3, 7
};
245
PredIntra4x4Mode(int8_t * pIntraPredMode,int32_t iIdx4)246 int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4) {
247 int8_t iTopMode = pIntraPredMode[iIdx4 - 8];
248 int8_t iLeftMode = pIntraPredMode[iIdx4 - 1];
249 int8_t iBestMode;
250
251 if (-1 == iLeftMode || -1 == iTopMode) {
252 iBestMode = 2;
253 } else {
254 iBestMode = WELS_MIN (iLeftMode, iTopMode);
255 }
256 return iBestMode;
257 }
258
WelsMdIntraInit(sWelsEncCtx * pEncCtx,SMB * pCurMb,SMbCache * pMbCache,const int32_t iSliceFirstMbXY)259 void WelsMdIntraInit (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, const int32_t iSliceFirstMbXY) {
260 SDqLayer* pCurLayer = pEncCtx->pCurDqLayer;
261
262 const int32_t kiMbX = pCurMb->iMbX;
263 const int32_t kiMbY = pCurMb->iMbY;
264 const int32_t kiMbXY = pCurMb->iMbXY;
265
266 // step 3. locating current pEnc and pDec
267 // unroll loops here
268 if (0 == kiMbX || iSliceFirstMbXY == kiMbXY) {
269 int32_t iStrideY, iStrideUV;
270 int32_t iOffsetY, iOffsetUV;
271
272 iStrideY = pCurLayer->iEncStride[0];
273 iStrideUV = pCurLayer->iEncStride[1];
274 iOffsetY = (kiMbX + kiMbY * iStrideY) << 4;
275 iOffsetUV = (kiMbX + kiMbY * iStrideUV) << 3;
276 pMbCache->SPicData.pEncMb[0] = pCurLayer->pEncData[0] + iOffsetY;
277 pMbCache->SPicData.pEncMb[1] = pCurLayer->pEncData[1] + iOffsetUV;
278 pMbCache->SPicData.pEncMb[2] = pCurLayer->pEncData[2] + iOffsetUV;
279
280 iStrideY = pCurLayer->iCsStride[0];
281 iStrideUV = pCurLayer->iCsStride[1];
282 iOffsetY = (kiMbX + kiMbY * iStrideY) << 4;
283 iOffsetUV = (kiMbX + kiMbY * iStrideUV) << 3;
284 pMbCache->SPicData.pCsMb[0] = pCurLayer->pCsData[0] + iOffsetY;
285 pMbCache->SPicData.pCsMb[1] = pCurLayer->pCsData[1] + iOffsetUV;
286 pMbCache->SPicData.pCsMb[2] = pCurLayer->pCsData[2] + iOffsetUV;
287
288 iStrideY = pCurLayer->pDecPic->iLineSize[0];
289 iStrideUV = pCurLayer->pDecPic->iLineSize[1];
290 iOffsetY = (kiMbX + kiMbY * iStrideY) << 4;
291 iOffsetUV = (kiMbX + kiMbY * iStrideUV) << 3;
292 pMbCache->SPicData.pDecMb[0] = pCurLayer->pDecPic->pData[0] + iOffsetY;
293 pMbCache->SPicData.pDecMb[1] = pCurLayer->pDecPic->pData[1] + iOffsetUV;
294 pMbCache->SPicData.pDecMb[2] = pCurLayer->pDecPic->pData[2] + iOffsetUV;
295 } else {
296 pMbCache->SPicData.pEncMb[0] += MB_WIDTH_LUMA;
297 pMbCache->SPicData.pEncMb[1] += MB_WIDTH_CHROMA;
298 pMbCache->SPicData.pEncMb[2] += MB_WIDTH_CHROMA;
299
300 pMbCache->SPicData.pDecMb[0] += MB_WIDTH_LUMA;
301 pMbCache->SPicData.pDecMb[1] += MB_WIDTH_CHROMA;
302 pMbCache->SPicData.pDecMb[2] += MB_WIDTH_CHROMA;
303
304 pMbCache->SPicData.pCsMb[0] += MB_WIDTH_LUMA;
305 pMbCache->SPicData.pCsMb[1] += MB_WIDTH_CHROMA;
306 pMbCache->SPicData.pCsMb[2] += MB_WIDTH_CHROMA;
307 }
308
309 //step 2. initial pWelsMd
310 pCurMb->uiCbp = 0;
311
312 //step 4: locating scaled_tcoeff
313
314 //step 1. load neighbor cache
315 FillNeighborCacheIntra (pMbCache, pCurMb, pCurLayer->iMbWidth);
316 pMbCache->pMemPredLuma = pMbCache->pMemPredMb;// in WelsMdI16x16() will be changed, so re-init here!
317 pMbCache->pMemPredChroma = pMbCache->pMemPredMb +
318 256;// Init with default, maybe change in WelsMdI16x16 and svc_md_i16x16_sad
319 }
320
WelsMdInterInit(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,const int32_t iSliceFirstMbXY)321 void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t iSliceFirstMbXY) {
322 SDqLayer* pCurLayer = pEncCtx->pCurDqLayer;
323 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
324 const int32_t kiMbX = pCurMb->iMbX;
325 const int32_t kiMbY = pCurMb->iMbY;
326 const int32_t kiMbXY = pCurMb->iMbXY;
327 const int32_t kiMbWidth = pCurLayer->iMbWidth;
328 const int32_t kiMbHeight = pCurLayer->iMbHeight;
329
330 pMbCache->pEncSad = &pCurLayer->pDecPic->pMbSkipSad[kiMbXY];
331
332 //step 1. load neighbor cache
333 pEncCtx->pFuncList->pfFillInterNeighborCache (pMbCache, pCurMb, kiMbWidth,
334 pEncCtx->pVaa->pVaaBackgroundMbFlag + kiMbXY); //BGD spatial pFunc
335
336 //step 3: initial cost
337
338 //step 4. locating current p_ref
339 // merge loops
340 if (0 == kiMbX || iSliceFirstMbXY == kiMbXY) {
341 const int32_t kiRefStrideY = pCurLayer->pRefPic->iLineSize[0];
342 const int32_t kiRefStrideUV = pCurLayer->pRefPic->iLineSize[1];
343 const int32_t kiCurStrideY = (kiMbX + kiMbY * kiRefStrideY) << 4;
344 const int32_t kiCurStrideUV = (kiMbX + kiMbY * kiRefStrideUV) << 3;
345 pMbCache->SPicData.pRefMb[0] = pCurLayer->pRefPic->pData[0] + kiCurStrideY;
346 pMbCache->SPicData.pRefMb[1] = pCurLayer->pRefPic->pData[1] + kiCurStrideUV;
347 pMbCache->SPicData.pRefMb[2] = pCurLayer->pRefPic->pData[2] + kiCurStrideUV;
348 } else {
349 pMbCache->SPicData.pRefMb[0] += MB_WIDTH_LUMA;
350 pMbCache->SPicData.pRefMb[1] += MB_WIDTH_CHROMA;
351 pMbCache->SPicData.pRefMb[2] += MB_WIDTH_CHROMA;
352 }
353
354 pMbCache->uiRefMbType = pCurLayer->pRefPic->uiRefMbType[kiMbXY];
355 pMbCache->bCollocatedPredFlag = false;
356
357 //comment: sometimes, mode decision process may skip the md_p16x16 and md_pskip function,
358 ST32 (&pCurMb->sP16x16Mv, 0);
359 ST32 (&pCurLayer->pDecPic->sMvList[kiMbXY], 0);
360
361 SetMvWithinIntegerMvRange (kiMbWidth, kiMbHeight, kiMbX, kiMbY, pEncCtx->iMvRange, & (pSlice->sMvStartMin),
362 & (pSlice->sMvStartMax));
363 }
364
WelsMdI16x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SMbCache * pMbCache,int32_t iLambda)365 int32_t WelsMdI16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
366 const int8_t* kpAvailMode;
367 int32_t iAvailCount;
368 int32_t iIdx = 0;
369 uint8_t* pPredI16x16[2] = {pMbCache->pMemPredMb, pMbCache->pMemPredMb + 256};
370 uint8_t* pDst = pPredI16x16[0];
371 uint8_t* pDec = pMbCache->SPicData.pCsMb[0];
372 uint8_t* pEnc = pMbCache->SPicData.pEncMb[0];
373 int32_t iLineSizeDec = pCurDqLayer->iCsStride[0];
374 int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
375 int32_t i, iCurCost, iCurMode, iBestMode, iBestCost = INT_MAX;
376
377 int32_t iOffset = pMbCache->uiNeighborIntra & 0x07;
378 iAvailCount = g_kiIntra16AvaliMode[iOffset][4];
379 kpAvailMode = g_kiIntra16AvaliMode[iOffset];
380 if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra16x16Combined3) {
381 iBestCost = pFunc->sSampleDealingFuncs.pfIntra16x16Combined3 (pDec, iLineSizeDec, pEnc, iLineSizeEnc, &iBestMode,
382 iLambda, pDst/*temp*/);
383 iCurMode = kpAvailMode[3];
384 pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
385 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc) + iLambda * 4 ;
386 if (iCurCost < iBestCost) {
387 iBestMode = iCurMode;
388 iBestCost = iCurCost;
389 } else {
390 pFunc->pfGetLumaI16x16Pred[iBestMode] (pDst, pDec, iLineSizeDec);
391 }
392 iIdx = 1;
393 iBestCost += iLambda;
394 } else {
395 iBestMode = kpAvailMode[0];
396 for (i = 0; i < iAvailCount; ++ i) {
397 iCurMode = kpAvailMode[i];
398
399 assert (iCurMode >= 0 && iCurMode < 7);
400
401 pFunc->pfGetLumaI16x16Pred[iCurMode] (pDst, pDec, iLineSizeDec);
402 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16] (pDst, 16, pEnc, iLineSizeEnc);
403 iCurCost += iLambda * (BsSizeUE (g_kiMapModeI16x16[iCurMode]));
404 if (iCurCost < iBestCost) {
405 iBestMode = iCurMode;
406 iBestCost = iCurCost;
407 iIdx = iIdx ^ 0x01;
408 pDst = pPredI16x16[iIdx];
409 }
410 }
411 }
412 pMbCache->pMemPredChroma = pPredI16x16[iIdx];
413
414 pMbCache->pMemPredLuma = pPredI16x16[iIdx ^ 0x01];
415 pMbCache->uiLumaI16x16Mode = iBestMode;
416 return iBestCost;
417 }
WelsMdI4x4(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)418 int32_t WelsMdI4x4 (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
419 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
420 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
421 int32_t iLambda = pWelsMd->iLambda;
422 int32_t iBestCostLuma = pWelsMd->iCostLuma;
423 uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
424 uint8_t* pDecMb = pMbCache->SPicData.pCsMb[0];
425 const int32_t kiLineSizeEnc = pCurDqLayer->iEncStride[0];
426 const int32_t kiLineSizeDec = pCurDqLayer->iCsStride[0];
427
428 uint8_t* pCurEnc, *pCurDec, *pDst;
429
430 int32_t iPredMode, iCurMode, iBestMode, iFinalMode;
431 int32_t iCurCost, iBestCost;
432 int32_t iAvailCount;
433 const uint8_t* kpAvailMode;
434 int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
435 int32_t lambda[2] = {iLambda << 2, iLambda};
436 bool* pPrevIntra4x4PredModeFlag = pMbCache->pPrevIntra4x4PredModeFlag;
437 int8_t* pRemIntra4x4PredModeFlag = pMbCache->pRemIntra4x4PredModeFlag;
438 const uint8_t* kpIntra4x4AvailCount = &g_kiIntra4AvailCount[0];
439 const uint8_t* kpCache48CountScan4 = &g_kuiCache48CountScan4Idx[0];
440 const int8_t* kpNeighborIntraToI4x4 = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
441 const int8_t* kpCoordinateIdxX = &g_kiCoordinateIdx4x4X[0];
442 const int8_t* kpCoordinateIdxY = &g_kiCoordinateIdx4x4Y[0];
443 int32_t iBestPredBufferNum = 0;
444 int32_t iCosti4x4 = 0;
445
446 #if defined(X86_ASM)
447 WelsPrefetchZero_mmx (g_kiMapModeI4x4);
448 WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
449 #endif//X86_ASM
450
451 for (i = 0; i < 16; i++) {
452 const int32_t kiOffset = kpNeighborIntraToI4x4[i];
453
454 //step 1: locating current 4x4 block position in pEnc and pDecMb
455 iCoordinateX = kpCoordinateIdxX[i];
456 iCoordinateY = kpCoordinateIdxY[i];
457
458 iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
459 pCurEnc = pEncMb + iIdxStrideEnc;
460 iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
461 pCurDec = pDecMb + iIdxStrideDec;
462
463 //step 2: get predicted mode from neighbor
464 iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
465
466 //step 3: collect candidates of iPredMode
467 iAvailCount = kpIntra4x4AvailCount[kiOffset];
468 kpAvailMode = g_kiIntra4AvailMode[kiOffset];
469
470 //step 4: gain the best pred mode
471 iBestCost = INT_MAX;
472 iBestMode = kpAvailMode[0];
473
474 if (pFunc->sSampleDealingFuncs.pfIntra4x4Combined3 && (iAvailCount >= 6)) {
475 pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
476
477 iBestCost = pFunc->sSampleDealingFuncs.pfIntra4x4Combined3 (pCurDec, kiLineSizeDec, pCurEnc, kiLineSizeEnc, pDst,
478 &iBestMode,
479 lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
480 // ST64(&pMbCache->pMemPredBlk4[iBestMode<<4], LD64(mem_pred_blk4_temp));
481 // ST64(&pMbCache->pMemPredBlk4[8+(iBestMode<<4)], LD64(mem_pred_blk4_temp+8));
482
483 for (j = 3; j < iAvailCount; ++ j) {
484 iCurMode = kpAvailMode[j];
485
486 assert (iCurMode >= 0 && iCurMode < 14);
487
488 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
489
490 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
491 iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
492 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
493
494 if (iCurCost < iBestCost) {
495 iBestMode = iCurMode;
496 iBestCost = iCurCost;
497 iBestPredBufferNum = 1 - iBestPredBufferNum;
498 }
499 }
500 } else {
501 for (j = 0; j < iAvailCount; ++ j) {
502 iCurMode = kpAvailMode[j];
503
504 assert (iCurMode >= 0 && iCurMode < 14);
505
506 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
507
508 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
509 iCurCost = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
510 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
511
512 if (iCurCost < iBestCost) {
513 iBestMode = iCurMode;
514 iBestCost = iCurCost;
515 iBestPredBufferNum = 1 - iBestPredBufferNum;
516 }
517 }
518 }
519 pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
520 iCosti4x4 += iBestCost;
521 if (iCosti4x4 >= iBestCostLuma) {
522 break;
523 }
524
525 //step 5: update pred mode and sample avail cache
526 iFinalMode = g_kiMapModeI4x4[iBestMode];
527 if (iPredMode == iFinalMode) {
528 *pPrevIntra4x4PredModeFlag++ = true;
529 } else {
530 *pPrevIntra4x4PredModeFlag++ = false;
531 *pRemIntra4x4PredModeFlag = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
532 }
533 pRemIntra4x4PredModeFlag++;
534 // pCurMb->pIntra4x4PredMode[g_kuiMbCountScan4Idx[i]] = iFinalMode;
535 pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
536
537 //step 6: encoding I_4x4
538 WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
539 }
540 ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
541 pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
542 pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
543 pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
544 iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
545 return iCosti4x4;
546 }
547
WelsMdI4x4Fast(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)548 int32_t WelsMdI4x4Fast (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
549 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
550 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
551 int32_t iLambda = pWelsMd->iLambda;
552 int32_t iBestCostLuma = pWelsMd->iCostLuma;
553 uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
554 uint8_t* pDecMb = pMbCache->SPicData.pCsMb[0];
555 const int32_t kiLineSizeEnc = pCurDqLayer->iEncStride[0];
556 const int32_t kiLineSizeDec = pCurDqLayer->iCsStride[0];
557
558 uint8_t* pCurEnc, *pCurDec, *pDst;
559 int8_t iPredMode, iCurMode, iBestMode, iFinalMode;
560 int32_t iCurCost, iBestCost;
561 int32_t iAvailCount;
562 const uint8_t* kpAvailMode;
563 int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
564 int32_t iCostH, iCostV, iCostVR, iCostHD, iCostVL, iCostHU, iBestModeFake;
565 int32_t lambda[2] = {iLambda << 2, iLambda};
566 bool* pPrevIntra4x4PredModeFlag = pMbCache->pPrevIntra4x4PredModeFlag;
567 int8_t* pRemIntra4x4PredModeFlag = pMbCache->pRemIntra4x4PredModeFlag;
568 const uint8_t* kpIntra4x4AvailCount = &g_kiIntra4AvailCount[0];
569 const uint8_t* kpCache48CountScan4 = &g_kuiCache48CountScan4Idx[0];
570 const int8_t* kpNeighborIntraToI4x4 = g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
571 const int8_t* kpCoordinateIdxX = &g_kiCoordinateIdx4x4X[0];
572 const int8_t* kpCoordinateIdxY = &g_kiCoordinateIdx4x4Y[0];
573 int32_t iBestPredBufferNum = 0;
574 int32_t iCosti4x4 = 0;
575 #if defined(X86_ASM)
576 WelsPrefetchZero_mmx (g_kiMapModeI4x4);
577 WelsPrefetchZero_mmx ((int8_t*)&pFunc->pfGetLumaI4x4Pred);
578 #endif//X86_ASM
579
580 for (i = 0; i < 16; i++) {
581 const int32_t kiOffset = kpNeighborIntraToI4x4[i];
582 // const int32_t i_next = (1+i) & 15; // next loop
583 // const uint8_t dummy_byte= pIntra4x4AvailCount[pNeighborIntraToI4x4[i_next]]; // prefetch pIntra4x4AvailCount of next loop to avoid cache missed
584
585 //step 1: locating current 4x4 block position in pEnc and pDecMb
586 iCoordinateX = kpCoordinateIdxX[i];
587 iCoordinateY = kpCoordinateIdxY[i];
588
589 iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
590 pCurEnc = pEncMb + iIdxStrideEnc;
591 iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
592 pCurDec = pDecMb + iIdxStrideDec;
593
594 //step 2: get predicted mode from neighbor
595 iPredMode = PredIntra4x4Mode (pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
596 //step 3: collect candidates of iPredMode
597 iAvailCount = kpIntra4x4AvailCount[kiOffset];
598 kpAvailMode = g_kiIntra4AvailMode[kiOffset];
599
600 if (iAvailCount == 9 || iAvailCount == 7) {
601 //I4_PRED_DC(2)
602
603 iBestMode = I4_PRED_DC;
604
605 pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
606
607 pFunc->pfGetLumaI4x4Pred[I4_PRED_DC] (pDst, pCurDec, kiLineSizeDec);
608 iBestCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
609 lambda[iPredMode == g_kiMapModeI4x4[iBestMode]];
610
611 //I4_PRED_H(1)
612 iCurMode = I4_PRED_H;
613
614 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
615
616 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
617 iCostH = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
618 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
619
620 if (iCurCost < iBestCost) {
621 iBestMode = iCurMode;
622 iBestCost = iCurCost;
623 iBestPredBufferNum = 1 - iBestPredBufferNum;
624 }
625
626 //I4_PRED_V(0)
627 iCurMode = I4_PRED_V;
628
629 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
630
631 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
632 iCostV = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
633 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
634
635 if (iCurCost < iBestCost) {
636 iBestMode = iCurMode;
637 iBestCost = iCurCost;
638 iBestPredBufferNum = 1 - iBestPredBufferNum;
639 }
640 if (iCostV < iCostH) {
641 if (iAvailCount == 9) {
642 iBestModeFake = true; //indicating whether V is the best fake mode
643
644 //I4_PRED_VR(5) and I4_PRED_VL(7)
645 iCurMode = I4_PRED_VR;
646
647 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
648
649 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
650 iCostVR = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
651 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
652
653 if (iCurCost < iBestCost) {
654 iBestMode = iCurMode;
655 iBestCost = iCurCost;
656 iBestPredBufferNum = 1 - iBestPredBufferNum;
657 }
658
659 if (iCurCost < iCostV)
660 iBestModeFake = false;
661
662 iCurMode = I4_PRED_VL;
663
664 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
665
666 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
667 iCostVL = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
668 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
669
670 if (iCurCost < iBestCost) {
671 iBestMode = iCurMode;
672 iBestCost = iCurCost;
673 iBestPredBufferNum = 1 - iBestPredBufferNum;
674 }
675
676 if (iCurCost < iCostV)
677 iBestModeFake = false;
678
679 //Vertical Early Determination
680 if (!iBestModeFake) { //Vertical is not the best, go on checking...
681 //select the best one from VL and VR
682 if (iCostVR < iCostVL) {
683 //I4_PRED_DDR(4)
684 iCurMode = I4_PRED_DDR;
685
686 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
687
688 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
689
690 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
691 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
692
693 if (iCurCost < iBestCost) {
694 iBestMode = iCurMode;
695 iBestCost = iCurCost;
696 iBestPredBufferNum = 1 - iBestPredBufferNum;
697 }
698 } else {
699 //I4_PRED_DDL(3)
700 iCurMode = I4_PRED_DDL;
701
702 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
703
704 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
705
706 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
707 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
708
709 if (iCurCost < iBestCost) {
710 iBestMode = iCurMode;
711 iBestCost = iCurCost;
712 iBestPredBufferNum = 1 - iBestPredBufferNum;
713 }
714 }
715 }
716 } else if (iAvailCount == 7) {
717 iCurMode = I4_PRED_DDR;
718
719 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
720
721 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
722 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
723 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
724
725 if (iCurCost < iBestCost) {
726 iBestMode = iCurMode;
727 iBestCost = iCurCost;
728 iBestPredBufferNum = 1 - iBestPredBufferNum;
729 }
730
731 iCurMode = I4_PRED_VR;
732
733 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
734
735 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
736
737 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
738 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
739
740 if (iCurCost < iBestCost) {
741 iBestMode = iCurMode;
742 iBestCost = iCurCost;
743 iBestPredBufferNum = 1 - iBestPredBufferNum;
744 }
745 }
746 } else {
747 iBestModeFake = true; //indicating whether H is the best fake mode
748 //I4_PRED_HD(6) and I4_PRED_HU(8)
749 iCurMode = I4_PRED_HD;
750
751 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
752
753 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
754 iCostHD = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
755 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
756
757 if (iCurCost < iBestCost) {
758 iBestMode = iCurMode;
759 iBestCost = iCurCost;
760 iBestPredBufferNum = 1 - iBestPredBufferNum;
761 }
762
763 if (iCurCost < iCostH)
764 iBestModeFake = false;
765
766 iCurMode = I4_PRED_HU;
767
768 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
769
770 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
771 iCostHU = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
772 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
773
774 if (iCurCost < iBestCost) {
775 iBestMode = iCurMode;
776 iBestCost = iCurCost;
777 iBestPredBufferNum = 1 - iBestPredBufferNum;
778 }
779
780 if (iCurCost < iCostH)
781 iBestModeFake = false;
782
783 if (!iBestModeFake) { //Horizontal is not the best, go on checking...
//select the direction to refine from HD and HU
785 if (iCostHD < iCostHU) {
786 //I4_PRED_DDR(4)
787 iCurMode = I4_PRED_DDR;
788
789 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
790
791 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
792 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
793 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
794
795 if (iCurCost < iBestCost) {
796 iBestMode = iCurMode;
797 iBestCost = iCurCost;
798 iBestPredBufferNum = 1 - iBestPredBufferNum;
799 }
800 } else if (iAvailCount == 9) {
801 //I4_PRED_DDL(3)
802 iCurMode = I4_PRED_DDL;
803
804 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
805 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
806
807 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
808 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
809
810 if (iCurCost < iBestCost) {
811 iBestMode = iCurMode;
812 iBestCost = iCurCost;
813 iBestPredBufferNum = 1 - iBestPredBufferNum;
814 }
815
816 }
817 }
818 }
819 } else {
820 iBestCost = INT_MAX;
821 iBestMode = I4_PRED_INVALID;
822 for (j = 0; j < iAvailCount; j++) {
823 // I4x4_MODE_CHECK(pAvailMode[j], iCurCost);
824 iCurMode = kpAvailMode[j];
825
826 pDst = &pMbCache->pMemPredBlk4[ (1 - iBestPredBufferNum) << 4];
827
828 pFunc->pfGetLumaI4x4Pred[iCurMode] (pDst, pCurDec, kiLineSizeDec);
829 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4] (pDst, 4, pCurEnc, kiLineSizeEnc) +
830 lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
831
832 if (iCurCost < iBestCost) {
833 iBestMode = iCurMode;
834 iBestCost = iCurCost;
835 iBestPredBufferNum = 1 - iBestPredBufferNum;
836 }
837 }
838 }
839 pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
840 iCosti4x4 += iBestCost;
841 if (iCosti4x4 >= iBestCostLuma) {
842 break;
843 }
844
845 //step 5: update pred mode and sample avail cache
846 iFinalMode = g_kiMapModeI4x4[iBestMode];
847 if (iPredMode == iFinalMode) {
848 *pPrevIntra4x4PredModeFlag++ = true;
849 } else {
850 *pPrevIntra4x4PredModeFlag++ = false;
851 *pRemIntra4x4PredModeFlag = (iFinalMode < iPredMode ? iFinalMode : (iFinalMode - 1));
852 }
853 pRemIntra4x4PredModeFlag++;
854 // pCurMb->pIntra4x4PredMode[scan4[i]] = iFinalMode;
855 pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
856 //step 6: encoding I_4x4
857 WelsEncRecI4x4Y (pEncCtx, pCurMb, pMbCache, i);
858 }
859 ST32 (pCurMb->pIntra4x4PredMode, LD32 (&pMbCache->iIntraPredMode[33]));
860 pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
861 pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
862 pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
863 iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0
864 return iCosti4x4;
865 }
866
WelsMdIntraChroma(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SMbCache * pMbCache,int32_t iLambda)867 int32_t WelsMdIntraChroma (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SMbCache* pMbCache, int32_t iLambda) {
868 const int8_t* kpAvailMode;
869 int32_t iAvailCount = 0;
870 int32_t iChmaIdx = 0;
871 uint8_t* pPredIntraChma[2] = {pMbCache->pMemPredChroma, pMbCache->pMemPredChroma + 128};
872 uint8_t* pDstChma = pPredIntraChma[0];
873 uint8_t* pEncCb = pMbCache->SPicData.pEncMb[1];
874 uint8_t* pEncCr = pMbCache->SPicData.pEncMb[2];
875 uint8_t* pDecCb = pMbCache->SPicData.pCsMb[1];//pMbCache->SPicData.pDecMb[1];
876 uint8_t* pDecCr = pMbCache->SPicData.pCsMb[2];//pMbCache->SPicData.pDecMb[2];
877 const int32_t kiLineSizeEnc = pCurDqLayer->iEncStride[1];
878 const int32_t kiLineSizeDec = pCurDqLayer->iCsStride[1];//pMbCache->SPicData.i_stride_dec[1];
879
880 int32_t i, iCurMode, iCurCost, iBestMode, iBestCost = INT_MAX;
881
882 int32_t iOffset = pMbCache->uiNeighborIntra & 0x07;
883 iAvailCount = g_kiIntraChromaAvailMode[iOffset][4];
884 kpAvailMode = g_kiIntraChromaAvailMode[iOffset];
885 if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra8x8Combined3) {
886 iBestCost = pFunc->sSampleDealingFuncs.pfIntra8x8Combined3 (pDecCb, kiLineSizeDec, pEncCb, kiLineSizeEnc, &iBestMode,
887 iLambda, pDstChma, pDecCr, pEncCr);
888 iCurMode = kpAvailMode[3];
889 pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
890 pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
891
892 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc) +
893 pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
894 iLambda * 4;
895 if (iCurCost < iBestCost) {
896 iBestMode = iCurMode;
897 iBestCost = iCurCost;
898 } else {
899 pFunc->pfGetChromaPred[iBestMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
900 pFunc->pfGetChromaPred[iBestMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
901 }
902 iBestCost += iLambda;
903 iChmaIdx = 1;
904 } else {
905 iBestMode = kpAvailMode[0];
906 for (i = 0; i < iAvailCount; ++ i) {
907 iCurMode = kpAvailMode[i];
908
909 assert (iCurMode >= 0 && iCurMode < 7);
910
911 // pDstCb = &pMbCache->mem_pred_intra_cb[iCurMode<<6];
912 // pDstCr = &pMbCache->mem_pred_intra_cr[iCurMode<<6];
913 pFunc->pfGetChromaPred[iCurMode] (pDstChma, pDecCb, kiLineSizeDec); //Cb
914 iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma, 8, pEncCb, kiLineSizeEnc);
915
916 pFunc->pfGetChromaPred[iCurMode] (pDstChma + 64, pDecCr, kiLineSizeDec); //Cr
917 iCurCost += pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8] (pDstChma + 64, 8, pEncCr, kiLineSizeEnc) +
918 iLambda * BsSizeUE (g_kiMapModeIntraChroma[iCurMode]);
919 if (iCurCost < iBestCost) {
920 iBestMode = iCurMode;
921 iBestCost = iCurCost;
922 iChmaIdx = iChmaIdx ^ 0x01;
923 pDstChma = pPredIntraChma[iChmaIdx];
924 }
925 }
926 }
927
928 pMbCache->pBestPredIntraChroma = pPredIntraChma[iChmaIdx ^ 0x01];
929 pMbCache->uiChmaI8x8Mode = iBestMode;
930 return iBestCost;
931 }
WelsMdIntraFinePartition(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)932 int32_t WelsMdIntraFinePartition (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
933 int32_t iCosti4x4 = WelsMdI4x4 (pEncCtx, pWelsMd, pCurMb, pMbCache);
934
935 if (iCosti4x4 < pWelsMd->iCostLuma) {
936 pCurMb->uiMbType = MB_TYPE_INTRA4x4;
937 pWelsMd->iCostLuma = iCosti4x4;
938 }
939 return pWelsMd->iCostLuma;
940 }
941
WelsMdIntraFinePartitionVaa(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)942 int32_t WelsMdIntraFinePartitionVaa (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
943
944 if (MdIntraAnalysisVaaInfo (pEncCtx, pMbCache->SPicData.pEncMb[0])) {
945 int32_t iCosti4x4 = WelsMdI4x4Fast (pEncCtx, pWelsMd, pCurMb, pMbCache);
946
947 if (iCosti4x4 < pWelsMd->iCostLuma) {
948 pCurMb->uiMbType = MB_TYPE_INTRA4x4;
949 pWelsMd->iCostLuma = iCosti4x4;
950 }
951 }
952
953 return pWelsMd->iCostLuma;
954 }
955
WelsMdIntraMb(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)956 void WelsMdIntraMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
957 //initial prediction memory for I_16x16
958 pWelsMd->iCostLuma = WelsMdI16x16 (pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
959 pCurMb->uiMbType = MB_TYPE_INTRA16x16;
960
961 WelsMdIntraSecondaryModesEnc (pEncCtx, pWelsMd, pCurMb, pMbCache);
962 }
963
InitMe(const SWelsMD & sWelsMd,const int32_t iBlockSize,uint8_t * pEnc,uint8_t * pRef,SScreenBlockFeatureStorage * pRefFeatureStorage,SWelsME & sWelsMe)964 static inline void InitMe (const SWelsMD& sWelsMd, const int32_t iBlockSize, uint8_t* pEnc, uint8_t* pRef,
965 SScreenBlockFeatureStorage* pRefFeatureStorage,
966 SWelsME& sWelsMe) {
967 sWelsMe.iCurMeBlockPixX = sWelsMd.iMbPixX;
968 sWelsMe.iCurMeBlockPixY = sWelsMd.iMbPixY;
969 sWelsMe.uiBlockSize = iBlockSize;
970 sWelsMe.pMvdCost = sWelsMd.pMvdCost;
971
972 sWelsMe.pEncMb = pEnc;
973 sWelsMe.pRefMb = sWelsMe.pColoRefMb = pRef;
974
975 sWelsMe.pRefFeatureStorage = pRefFeatureStorage;
976 }
977
WelsMdP16x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurLayer,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb)978 int32_t WelsMdP16x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb) {
979 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
980 SWelsME* pMe16x16 = &pWelsMd->sMe.sMe16x16;
981 uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
982 const int32_t kiMbWidth = pCurLayer->iMbWidth; // for assign once
983 const int32_t kiMbHeight = pCurLayer->iMbHeight;
984 InitMe (*pWelsMd, BLOCK_16x16, pMbCache->SPicData.pEncMb[0], pMbCache->SPicData.pRefMb[0],
985 pCurLayer->pRefPic->pScreenBlockFeatureStorage,
986 *pMe16x16);
987 //not putting the line below into InitMe to avoid judging mode in InitMe
988 pMe16x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb;
989
990 pSlice->uiMvcNum = 0;
991 pSlice->sMvc[pSlice->uiMvcNum++] = pMe16x16->sMvBase;
992 //spatial motion vector predictors
993 if (uiNeighborAvail & LEFT_MB_POS) { //left available
994 pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - 1)->sP16x16Mv;
995 }
996 if (uiNeighborAvail & TOP_MB_POS) { //top available
997 pSlice->sMvc[pSlice->uiMvcNum++] = (pCurMb - kiMbWidth)->sP16x16Mv;
998 }
999 //temporal motion vector predictors
1000 if (pCurLayer->pRefPic->iPictureType == P_SLICE) {
1001 if (pCurMb->iMbX < kiMbWidth - 1) {
1002 SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + 1];
1003 pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
1004 pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
1005 ++ pSlice->uiMvcNum;
1006 }
1007 if (pCurMb->iMbY < kiMbHeight - 1) {
1008 SMVUnitXY sTempMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY + kiMbWidth];
1009 pSlice->sMvc[pSlice->uiMvcNum].iMvX = sTempMv.iMvX >> pSlice->sScaleShift;
1010 pSlice->sMvc[pSlice->uiMvcNum].iMvY = sTempMv.iMvY >> pSlice->sScaleShift;
1011 ++ pSlice->uiMvcNum;
1012 }
1013 }
1014
1015 PredMv (&pMbCache->sMvComponents, 0, 4, 0, & (pMe16x16->sMvp));
1016 pFunc->pfMotionSearch[0] (pFunc, pCurLayer, pMe16x16, pSlice);
1017
1018 pCurMb->sP16x16Mv = pMe16x16->sMv;
1019 pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = pMe16x16->sMv;
1020
1021 return pMe16x16->uiSatdCost;
1022 }
WelsMdP16x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice)1023 int32_t WelsMdP16x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1024 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1025 int32_t iStrideEnc = pCurDqLayer->iEncStride[0];
1026 int32_t iStrideRef = pCurDqLayer->pRefPic->iLineSize[0];
1027 SWelsME* sMe16x8;
1028 int32_t i = 0, iPixelY;
1029 int32_t iCostP16x8 = 0;
1030 do {
1031 sMe16x8 = &pWelsMd->sMe.sMe16x8[i];
1032 iPixelY = (i << 3);
1033 InitMe (*pWelsMd, BLOCK_16x8,
1034 pMbCache->SPicData.pEncMb[0] + (iPixelY * iStrideEnc),
1035 pMbCache->SPicData.pRefMb[0] + (iPixelY * iStrideRef),
1036 pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1037 *sMe16x8);
1038 //not putting the lines below into InitMe to avoid judging mode in InitMe
1039 sMe16x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1040 sMe16x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
1041
1042 pSlice->sMvc[0] = sMe16x8->sMvBase;
1043 pSlice->uiMvcNum = 1;
1044
1045 PredInter16x8Mv (pMbCache, i << 3, 0, & (sMe16x8->sMvp));
1046 pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe16x8, pSlice);
1047 UpdateP16x8Motion2Cache (pMbCache, i << 3, pWelsMd->uiRef, & (sMe16x8->sMv));
1048 iCostP16x8 += sMe16x8->uiSatdCost;
1049 ++i;
1050 } while (i < 2);
1051 return iCostP16x8;
1052 }
WelsMdP8x16(SWelsFuncPtrList * pFunc,SDqLayer * pCurLayer,SWelsMD * pWelsMd,SSlice * pSlice)1053 int32_t WelsMdP8x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1054 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1055 SWelsME* sMe8x16;
1056 int32_t i = 0, iPixelX;
1057 int32_t iCostP8x16 = 0;
1058 do {
1059 iPixelX = (i << 3);
1060 sMe8x16 = &pWelsMd->sMe.sMe8x16[i];
1061 InitMe (*pWelsMd, BLOCK_8x16,
1062 pMbCache->SPicData.pEncMb[0] + iPixelX,
1063 pMbCache->SPicData.pRefMb[0] + iPixelX,
1064 pCurLayer->pRefPic->pScreenBlockFeatureStorage,
1065 *sMe8x16);
1066 //not putting the lines below into InitMe to avoid judging mode in InitMe
1067 sMe8x16->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1068 sMe8x16->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
1069
1070 pSlice->sMvc[0] = sMe8x16->sMvBase;
1071 pSlice->uiMvcNum = 1;
1072
1073 PredInter8x16Mv (pMbCache, i << 2, 0, & (sMe8x16->sMvp));
1074 pFunc->pfMotionSearch[0] (pFunc, pCurLayer, sMe8x16, pSlice);
1075 UpdateP8x16Motion2Cache (pMbCache, i << 2, pWelsMd->uiRef, & (sMe8x16->sMv));
1076 iCostP8x16 += sMe8x16->uiSatdCost;
1077 ++i;
1078 } while (i < 2);
1079 return iCostP8x16;
1080 }
WelsMdP8x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice)1081 int32_t WelsMdP8x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice) {
1082 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1083 int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1084 int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1085 SWelsME* sMe8x8;
1086 int32_t i, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1087 int32_t iCostP8x8 = 0;
1088 for (i = 0; i < 4; i++) {
1089 iIdxX = i & 1;
1090 iIdxY = i >> 1;
1091 iPixelX = (iIdxX << 3);
1092 iPixelY = (iIdxY << 3);
1093 iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1094 iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1095
1096 sMe8x8 = &pWelsMd->sMe.sMe8x8[i];
1097 InitMe (*pWelsMd, BLOCK_8x8,
1098 pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1099 pMbCache->SPicData.pRefMb[0] + iStrideRef,
1100 pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1101 *sMe8x8);
1102 //not putting these three lines below into InitMe to avoid judging mode in InitMe
1103 sMe8x8->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1104 sMe8x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1105 sMe8x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1106
1107
1108 pSlice->sMvc[0] = sMe8x8->sMvBase;
1109 pSlice->uiMvcNum = 1;
1110
1111 PredMv (&pMbCache->sMvComponents, i << 2, 2, pWelsMd->uiRef, & (sMe8x8->sMvp));
1112 pFunc->pfMotionSearch[pWelsMd->iBlock8x8StaticIdc[i]] (pFunc, pCurDqLayer, sMe8x8, pSlice);
1113 UpdateP8x8Motion2Cache (pMbCache, i << 2, pWelsMd->uiRef, & (sMe8x8->sMv));
1114 iCostP8x8 += sMe8x8->uiSatdCost;
1115 // sMe8x8++;
1116 }
1117 return iCostP8x8;
1118 }
1119
WelsMdP4x4(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1120 int32_t WelsMdP4x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1121 const int32_t ki8x8Idx) {
1122 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1123 int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1124 int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1125 SWelsME* sMe4x4;
1126 int32_t i4x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1127 int32_t iCostP4x4 = 0;
1128 for (i4x4Idx = 0; i4x4Idx < 4; ++i4x4Idx) {
1129 int32_t iPartIdx = (ki8x8Idx << 2) + i4x4Idx;
1130 iIdxX = ((ki8x8Idx & 1) << 1) + (i4x4Idx & 1);
1131 iIdxY = ((ki8x8Idx >> 1) << 1) + (i4x4Idx >> 1);
1132 iPixelX = (iIdxX << 2);
1133 iPixelY = (iIdxY << 2);
1134 iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1135 iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1136
1137 sMe4x4 = &pWelsMd->sMe.sMe4x4[ki8x8Idx][i4x4Idx];
1138 InitMe (*pWelsMd, BLOCK_4x4,
1139 pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1140 pMbCache->SPicData.pRefMb[0] + iStrideRef,
1141 pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1142 *sMe4x4);
1143 //not putting these three lines below into InitMe to avoid judging mode in InitMe
1144 sMe4x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1145 sMe4x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1146 sMe4x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1147
1148 pSlice->sMvc[0] = sMe4x4->sMvBase;
1149 pSlice->uiMvcNum = 1;
1150
1151 PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x4->sMvp));
1152 pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x4, pSlice);
1153 UpdateP4x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x4->sMv));
1154 iCostP4x4 += sMe4x4->uiSatdCost;
1155 }
1156 return iCostP4x4;
1157 }
1158
WelsMdP8x4(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1159 int32_t WelsMdP8x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1160 const int32_t ki8x8Idx) {
1161 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1162 int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1163 int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1164 SWelsME* sMe8x4;
1165 int32_t i8x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1166 int32_t iCostP8x4 = 0;
1167 for (i8x4Idx = 0; i8x4Idx < 2; ++i8x4Idx) {
1168 int32_t iPartIdx = (ki8x8Idx << 2) + (i8x4Idx << 1);
1169 iIdxX = ((ki8x8Idx & 1) << 1);
1170 iIdxY = ((ki8x8Idx >> 1) << 1) + i8x4Idx;
1171 iPixelX = (iIdxX << 2);
1172 iPixelY = (iIdxY << 2);
1173 iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1174 iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1175
1176 sMe8x4 = &pWelsMd->sMe.sMe8x4[ki8x8Idx][i8x4Idx];
1177 InitMe (*pWelsMd, BLOCK_8x4,
1178 pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1179 pMbCache->SPicData.pRefMb[0] + iStrideRef,
1180 pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1181 *sMe8x4);
1182 //not putting these three lines below into InitMe to avoid judging mode in InitMe
1183 sMe8x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1184 sMe8x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1185 sMe8x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1186
1187 pSlice->sMvc[0] = sMe8x4->sMvBase;
1188 pSlice->uiMvcNum = 1;
1189
1190 PredMv (&pMbCache->sMvComponents, iPartIdx, 2, pWelsMd->uiRef, & (sMe8x4->sMvp));
1191 pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe8x4, pSlice);
1192 UpdateP8x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe8x4->sMv));
1193 iCostP8x4 += sMe8x4->uiSatdCost;
1194 }
1195 return iCostP8x4;
1196 }
1197
WelsMdP4x8(SWelsFuncPtrList * pFunc,SDqLayer * pCurDqLayer,SWelsMD * pWelsMd,SSlice * pSlice,const int32_t ki8x8Idx)1198 int32_t WelsMdP4x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
1199 const int32_t ki8x8Idx) {
1200 //Wayne, to be modified
1201 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1202 int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
1203 int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
1204 SWelsME* sMe4x8;
1205 int32_t i4x8Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
1206 int32_t iCostP4x8 = 0;
1207 for (i4x8Idx = 0; i4x8Idx < 2; ++i4x8Idx) {
1208 int32_t iPartIdx = (ki8x8Idx << 2) + i4x8Idx;
1209 iIdxX = ((ki8x8Idx & 1) << 1) + i4x8Idx;
1210 iIdxY = ((ki8x8Idx >> 1) << 1);
1211 iPixelX = (iIdxX << 2);
1212 iPixelY = (iIdxY << 2);
1213 iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc);
1214 iStrideRef = iPixelX + (iPixelY * iLineSizeRef);
1215
1216 sMe4x8 = &pWelsMd->sMe.sMe4x8[ki8x8Idx][i4x8Idx];
1217 InitMe (*pWelsMd, BLOCK_4x8,
1218 pMbCache->SPicData.pEncMb[0] + iStrideEnc,
1219 pMbCache->SPicData.pRefMb[0] + iStrideRef,
1220 pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
1221 *sMe4x8);
1222 //not putting these three lines below into InitMe to avoid judging mode in InitMe
1223 sMe4x8->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
1224 sMe4x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
1225 sMe4x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
1226
1227 pSlice->sMvc[0] = sMe4x8->sMvBase;
1228 pSlice->uiMvcNum = 1;
1229
1230 PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x8->sMvp));
1231 pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x8, pSlice);
1232 UpdateP4x8Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x8->sMv));
1233 iCostP4x8 += sMe4x8->uiSatdCost;
1234 }
1235 return iCostP4x8;
1236 }
1237
WelsMdInterFinePartition(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,int32_t iBestCost)1238 void WelsMdInterFinePartition (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
1239 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1240 // SMbCache *pMbCache = &pSlice->sMbCacheInfo;
1241 int32_t iCost = 0;
1242
1243 // WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1244
1245 iCost = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1246
1247 if (iCost < iBestCost) {
1248 int32_t iCostPart;
1249 pCurMb->uiMbType = MB_TYPE_8x8;
1250 memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1251
1252 // WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1253 iCostPart = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1254 if (iCostPart <= iCost) {
1255 iCost = iCostPart;
1256 pCurMb->uiMbType = MB_TYPE_16x8;
1257 //pCurMb->mb_partition = 2;
1258 }
1259
1260 // WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1261 iCostPart = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1262 if (iCostPart <= iCost) {
1263 iCost = iCostPart;
1264 pCurMb->uiMbType = MB_TYPE_8x16;
1265 //pCurMb->mb_partition = 2;
1266 }
1267 }
1268 }
1269
WelsMdInterFinePartitionVaa(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,int32_t iBestCost)1270 void WelsMdInterFinePartitionVaa (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
1271 int32_t iBestCost) {
1272 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1273 // SMbCache *pMbCache = &pSlice->sMbCacheInfo;
1274 int32_t iCostP8x16, iCostP16x8, iCostP8x8;
1275 uint8_t uiMbSign = pEncCtx->pFuncList->pfGetMbSignFromInterVaa (&pEncCtx->pVaa->sVaaCalcInfo.pSad8x8[pCurMb->iMbXY][0]);
1276
1277 if (uiMbSign == 15) {
1278 return;
1279 }
1280
1281 // iCost = pWelsMd->sMe16x16.uiSatdCost;
1282
1283 switch (uiMbSign) {
1284 case 3:
1285 case 12:
1286 // WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1287 iCostP16x8 = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1288 if (iCostP16x8 < iBestCost) {
1289 iBestCost = iCostP16x8;
1290 pCurMb->uiMbType = MB_TYPE_16x8;
1291 //pCurMb->mb_partition = 2;
1292 }
1293 break;
1294
1295 case 5:
1296 case 10:
1297 // WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP8x16, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
1298 iCostP8x16 = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1299 if (iCostP8x16 < iBestCost) {
1300 iBestCost = iCostP8x16;
1301 pCurMb->uiMbType = MB_TYPE_8x16;
1302 //pCurMb->mb_partition = 2;
1303 }
1304 break;
1305
1306 case 6:
1307 case 9:
1308 iCostP8x8 = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1309 if (iCostP8x8 < iBestCost) {
1310 iBestCost = iCostP8x8;
1311 pCurMb->uiMbType = MB_TYPE_8x8;
1312 memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1313 }
1314 break;
1315
1316 default:
1317 iCostP8x8 = WelsMdP8x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1318 if (iCostP8x8 < iBestCost) {
1319 iBestCost = iCostP8x8;
1320 pCurMb->uiMbType = MB_TYPE_8x8;
1321 memset (pCurMb->uiSubMbType, SUB_MB_TYPE_8x8, 4);
1322
1323 iCostP16x8 = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1324 if (iCostP16x8 <= iBestCost) {
1325 iBestCost = iCostP16x8;
1326 pCurMb->uiMbType = MB_TYPE_16x8;
1327 }
1328
1329 iCostP8x16 = WelsMdP8x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
1330 if (iCostP8x16 <= iBestCost) {
1331 iBestCost = iCostP8x16;
1332 pCurMb->uiMbType = MB_TYPE_8x16;
1333 }
1334 }
1335 break;
1336 }
1337 pWelsMd->iCostLuma = iBestCost;
1338 }
1339
1340
VaaBackgroundMbDataUpdate(SWelsFuncPtrList * pFunc,SVAAFrameInfo * pVaaInfo,SMB * pCurMb)1341 inline void VaaBackgroundMbDataUpdate (SWelsFuncPtrList* pFunc, SVAAFrameInfo* pVaaInfo, SMB* pCurMb) {
1342 const int32_t kiPicStride = pVaaInfo->iPicStride;
1343 const int32_t kiPicStrideUV = pVaaInfo->iPicStrideUV;
1344 const int32_t kiOffsetY = (pCurMb->iMbY * kiPicStride + pCurMb->iMbX) << 4;
1345 const int32_t kiOffsetUV = (pCurMb->iMbY * kiPicStrideUV + pCurMb->iMbX) << 3;
1346
1347 pFunc->pfCopy16x16Aligned (pVaaInfo->pCurY + kiOffsetY, kiPicStride, pVaaInfo->pRefY + kiOffsetY, kiPicStride);
1348 pFunc->pfCopy8x8Aligned (pVaaInfo->pCurU + kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefU + kiOffsetUV, kiPicStrideUV);
1349 pFunc->pfCopy8x8Aligned (pVaaInfo->pCurV + kiOffsetUV, kiPicStrideUV, pVaaInfo->pRefV + kiOffsetUV, kiPicStrideUV);
1350 }
1351
WelsMdBackgroundMbEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache,SSlice * pSlice,bool bSkipMbFlag)1352 void WelsMdBackgroundMbEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache, SSlice* pSlice,
1353 bool bSkipMbFlag) {
1354 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1355 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1356 SMVUnitXY sMvp = { 0 };
1357 uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
1358 uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
1359 uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
1360 int32_t iLineSizeY = pCurDqLayer->pRefPic->iLineSize[0];
1361 int32_t iLineSizeUV = pCurDqLayer->pRefPic->iLineSize[1];
1362 uint8_t* pDstLuma = pMbCache->pSkipMb;
1363 uint8_t* pDstCb = pMbCache->pSkipMb + 256;
1364 uint8_t* pDstCr = pMbCache->pSkipMb + 256 + 64;
1365
1366 if (!bSkipMbFlag) {
1367 pDstLuma = pMbCache->pMemPredLuma;
1368 pDstCb = pMbCache->pMemPredChroma;
1369 pDstCr = pMbCache->pMemPredChroma + 64;
1370 }
1371 //MC
1372 pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, 0, 0, 16, 16);
1373 pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
1374 pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
1375
1376 pCurMb->uiCbp = 0;
1377 pMbCache->bCollocatedPredFlag = true;
1378 pWelsMd->iCostLuma = 0;//BGD&RC integration
1379 pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1380 pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
1381 ST32 (&pCurMb->sP16x16Mv, 0);
1382 ST32 (&pCurDqLayer->pDecPic->sMvList[pCurMb->iMbXY], 0);
1383
1384 if (bSkipMbFlag) {
1385 pCurMb->uiMbType = MB_TYPE_BACKGROUND;
1386
1387 //update motion info to current MB
1388 ST32 (pCurMb->pRefIndex, 0);
1389 pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1390
1391 pCurMb->uiLumaQp = pSlice->uiLastMbQp;
1392 pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
1393 pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
1394
1395 WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
1396 VaaBackgroundMbDataUpdate (pEncCtx->pFuncList, pEncCtx->pVaa, pCurMb);
1397 return;
1398 }
1399
1400 pCurMb->uiMbType = MB_TYPE_16x16;
1401
1402 pWelsMd->sMe.sMe16x16.sMv.iMvX = 0;
1403 pWelsMd->sMe.sMe16x16.sMv.iMvY = 0;
1404 PredMv (&pMbCache->sMvComponents, 0, 4, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMvp);
1405 pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
1406
1407 UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
1408
1409 if (pWelsMd->bMdUsingSad)
1410 pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1411 else
1412 pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1413 pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY);
1414
1415 WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
1416 WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
1417
1418 pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], pCurDqLayer->iCsStride[0], pMbCache->pMemPredLuma, 16);
1419 pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma, 8);
1420 pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma + 64, 8);
1421 }
1422
WelsMdPSkipEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1423 bool WelsMdPSkipEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1424 SDqLayer* pCurLayer = pEncCtx->pCurDqLayer;
1425 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1426
1427 uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
1428 uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
1429 uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
1430 int32_t iLineSizeY = pCurLayer->pRefPic->iLineSize[0];
1431 int32_t iLineSizeUV = pCurLayer->pRefPic->iLineSize[1];
1432
1433 uint8_t* pDstLuma = pMbCache->pSkipMb;
1434 uint8_t* pDstCb = pMbCache->pSkipMb + 256;
1435 uint8_t* pDstCr = pMbCache->pSkipMb + 256 + 64;
1436
1437 SMVUnitXY sMvp = { 0 };
1438 int32_t n;
1439
1440 int32_t iEncStride = pCurLayer->iEncStride[0];
1441 uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
1442 int32_t* pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
1443 int32_t* pEncBlockOffset;
1444
1445 int32_t iSadCostLuma = 0;
1446 int32_t iSadCostChroma = 0;
1447 int32_t iSadCostMb = 0;
1448
1449 PredSkipMv (pMbCache, &sMvp);
1450
1451 // Special case, need to clip the vector //
1452 SMVUnitXY sQpelMvp = { static_cast<int16_t> (sMvp.iMvX >> 2), static_cast<int16_t> (sMvp.iMvY >> 2) };
1453 n = (pCurMb->iMbX << 4) + sQpelMvp.iMvX;
1454 if (n < -29)
1455 return false;
1456 else if (n > (int32_t) ((pCurLayer->iMbWidth << 4) + 12))
1457 return false;
1458
1459 n = (pCurMb->iMbY << 4) + sQpelMvp.iMvY;
1460 if (n < -29)
1461 return false;
1462 else if (n > (int32_t) ((pCurLayer->iMbHeight << 4) + 12))
1463 return false;
1464
1465 //luma
1466 pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
1467 pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, sMvp.iMvX, sMvp.iMvY, 16, 16);
1468 iSadCostLuma = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1469 pCurLayer->iEncStride[0], pDstLuma, 16);
1470
1471 const int32_t iStrideUV = (sQpelMvp.iMvY >> 1) * iLineSizeUV + (sQpelMvp.iMvX >> 1);
1472 pRefCb += iStrideUV;
1473 pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
1474 iSadCostChroma = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
1475 pCurLayer->iEncStride[1], pDstCb, 8);
1476
1477 pRefCr += iStrideUV;
1478 pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
1479 iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
1480 pCurLayer->iEncStride[2], pDstCr, 8);
1481
1482 iSadCostMb = iSadCostLuma + iSadCostChroma;
1483
1484 if (iSadCostMb == 0 ||
1485 iSadCostMb < pWelsMd->iSadPredSkip ||
1486 (pCurLayer->pRefPic->iPictureType == P_SLICE &&
1487 pMbCache->uiRefMbType == MB_TYPE_SKIP &&
1488 iSadCostMb < pCurLayer->pRefPic->pMbSkipSad[pCurMb->iMbXY])) {
1489 //update motion info to current MB
1490 ST32 (pCurMb->pRefIndex, 0);
1491 pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1492
1493 if (pWelsMd->bMdUsingSad) {
1494 pCurMb->pSadCost[0] = iSadCostLuma;
1495 pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1496 } else
1497 pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1498 pCurLayer->iEncStride[0], pDstLuma, 16);
1499
1500 pWelsMd->iCostSkipMb = iSadCostMb;
1501
1502 pCurMb->sP16x16Mv = sMvp;
1503 pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
1504
1505 return true;
1506 }
1507
1508 WelsDctMb (pMbCache->pCoeffLevel, pEncMb, iEncStride, pDstLuma, pEncCtx->pFuncList->pfDctFourT4);
1509
1510 if (WelsTryPYskip (pEncCtx, pCurMb, pMbCache)) {
1511 iEncStride = pEncCtx->pCurDqLayer->iEncStride[1];
1512 pEncMb = pMbCache->SPicData.pEncMb[1];
1513 pEncBlockOffset = pStrideEncBlockOffset + 16;
1514 pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 256, & (pEncMb[*pEncBlockOffset]), iEncStride, pMbCache->pSkipMb + 256, 8);
1515 if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 1)) {
1516 pEncMb = pMbCache->SPicData.pEncMb[2];
1517 pEncBlockOffset = pStrideEncBlockOffset + 20;
1518 pFunc->pfDctFourT4 (pMbCache->pCoeffLevel + 320, & (pEncMb[*pEncBlockOffset]), iEncStride, pMbCache->pSkipMb + 320, 8);
1519 if (WelsTryPUVskip (pEncCtx, pCurMb, pMbCache, 2)) {
1520 //update motion info to current MB
1521 ST32 (pCurMb->pRefIndex, 0);
1522 pFunc->pfUpdateMbMv (pCurMb->sMv, sMvp);
1523
1524 if (pWelsMd->bMdUsingSad) {
1525 pCurMb->pSadCost[0] = iSadCostLuma;
1526 pWelsMd->iCostLuma = pCurMb->pSadCost[0];
1527 } else
1528 pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1529 pCurLayer->iEncStride[0], pDstLuma, 16);
1530
1531 pWelsMd->iCostSkipMb = iSadCostMb;
1532
1533 pCurMb->sP16x16Mv = sMvp;
1534 pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
1535
1536 return true;
1537 }
1538 }
1539 }
1540 return false;
1541 }
1542
1543 const int32_t g_kiPixStrideIdx8x8[4] = { 0, ME_REFINE_BUF_WIDTH_BLK8,
1544 ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8
1545 };
1546 const int32_t g_kiPixStrideIdx4x4[4][4] = {
1547 {
1548 0,
1549 0 + ME_REFINE_BUF_WIDTH_BLK4,
1550 0 + ME_REFINE_BUF_STRIDE_BLK4,
1551 0 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1552 }, //[0][]
1553 {
1554 ME_REFINE_BUF_WIDTH_BLK8,
1555 ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1556 ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1557 ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1558 }, //[1][]
1559 {
1560 ME_REFINE_BUF_STRIDE_BLK8,
1561 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1562 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1563 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1564 }, //[2][]
1565 {
1566 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8,
1567 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
1568 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
1569 ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
1570 } //[3][]
1571 };
1572
WelsMdInterMbRefinement(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1573 void WelsMdInterMbRefinement (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1574 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1575 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1576 uint8_t* pTmpRefCb, *pTmpRefCr, *pTmpDstCb, *pTmpDstCr;
1577 int32_t iMvStride, iRefBlk4Stride, iDstBlk4Stride;
1578 SMVUnitXY* pMv;
1579 int32_t iBestSadCost = 0, iBestSatdCost = 0;
1580 SMeRefinePointer sMeRefine;
1581
1582 int32_t i, j, iIdx, iPixStride;
1583
1584 uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
1585 uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
1586 uint8_t* pDstCb = pMbCache->pMemPredChroma;
1587 uint8_t* pDstCr = pMbCache->pMemPredChroma + 64;
1588 uint8_t* pDstLuma = pMbCache->pMemPredLuma;
1589
1590 int32_t iLineSizeRefUV = pCurDqLayer->pRefPic->iLineSize[1];
1591
1592 switch (pCurMb->uiMbType) {
1593 case MB_TYPE_16x16:
1594 //luma
1595 InitMeRefinePointer (&sMeRefine, pMbCache, 0);
1596 sMeRefine.pfCopyBlockByMode =
1597 pFunc->pfCopy16x16NotAligned; // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
1598 MeRefineFracPixel (pEncCtx, pDstLuma, &pWelsMd->sMe.sMe16x16, &sMeRefine, 16, 16);
1599 UpdateP16x16MotionInfo (pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
1600
1601 pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
1602 //save the best cost of final mode
1603 iBestSadCost = pWelsMd->sMe.sMe16x16.uiSadCost;
1604 iBestSatdCost = pWelsMd->sMe.sMe16x16.uiSatdCost;
1605
1606 //chroma
1607 pMv = &pWelsMd->sMe.sMe16x16.sMv;
1608 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1609 pTmpRefCb = pRefCb + iMvStride;
1610 pTmpRefCr = pRefCr + iMvStride;
1611 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cb
1612 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cr
1613
1614 pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
1615 pCurDqLayer->iEncStride[0], pDstLuma, 16);
1616 pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
1617 pCurDqLayer->iEncStride[1], pDstCb, 8);
1618 pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
1619 pCurDqLayer->iEncStride[2], pDstCr, 8);
1620 break;
1621
1622 case MB_TYPE_16x8:
1623 iPixStride = 0;
1624 sMeRefine.pfCopyBlockByMode =
1625 pFunc->pfCopy16x8NotAligned; // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
1626 for (i = 0; i < 2; i++) {
1627 //luma
1628 iIdx = i << 3;
1629 InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
1630 iPixStride += ME_REFINE_BUF_STRIDE_BLK8;
1631 PredInter16x8Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMvp);
1632 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe16x8[i], &sMeRefine, 16, 8);
1633 UpdateP16x8MotionInfo (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMv);
1634 pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe16x8[i].sMvp;
1635 //save the best cost of final mode
1636 iBestSadCost += pWelsMd->sMe.sMe16x8[i].uiSadCost;
1637 iBestSatdCost += pWelsMd->sMe.sMe16x8[i].uiSatdCost;
1638
1639 //chroma
1640 iRefBlk4Stride = (i << 2) * iLineSizeRefUV;
1641 iDstBlk4Stride = i << 5; // 4*8
1642 pMv = &pWelsMd->sMe.sMe16x8[i].sMv;
1643 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1644 pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
1645 pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
1646 pTmpDstCb = pDstCb + iDstBlk4Stride;
1647 pTmpDstCr = pDstCr + iDstBlk4Stride;
1648 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cb
1649 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cr
1650 }
1651 break;
1652
1653 case MB_TYPE_8x16:
1654 iPixStride = 0;
1655 sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x16Aligned;
1656 for (i = 0; i < 2; i++) {
1657 //luma
1658 iIdx = i << 2;
1659 InitMeRefinePointer (&sMeRefine, pMbCache, iPixStride);
1660 iPixStride += ME_REFINE_BUF_WIDTH_BLK8;
1661 PredInter8x16Mv (pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMvp);
1662 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe8x16[i], &sMeRefine, 8, 16);
1663 update_P8x16_motion_info (pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMv);
1664 pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x16[i].sMvp;
1665 //save the best cost of final mode
1666 iBestSadCost += pWelsMd->sMe.sMe8x16[i].uiSadCost;
1667 iBestSatdCost += pWelsMd->sMe.sMe8x16[i].uiSatdCost;
1668
1669 //chroma
1670 iRefBlk4Stride = iIdx; //4
1671 pMv = &pWelsMd->sMe.sMe8x16[i].sMv;
1672 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1673 pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
1674 pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
1675 pTmpDstCb = pDstCb + iRefBlk4Stride;
1676 pTmpDstCr = pDstCr + iRefBlk4Stride;
1677 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cb
1678 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cr
1679 }
1680 break;
1681 case MB_TYPE_8x8:
1682 pMbCache->sMvComponents.iRefIndexCache [9] = pMbCache->sMvComponents.iRefIndexCache [21] = REF_NOT_AVAIL;
1683 for (i = 0; i < 4; i++) {
1684 int32_t iBlk8Idx = i << 2; //0, 4, 8, 12
1685 int32_t iBlk4X, iBlk4Y, iBlk4x4Idx;
1686
1687 pCurMb->pRefIndex[i] = pWelsMd->uiRef;
1688 switch (pCurMb->uiSubMbType[i]) {
1689 case SUB_MB_TYPE_8x8:
1690 sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x8Aligned;
1691 //luma
1692 InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
1693 PredMv (&pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp);
1694 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);
1695 UpdateP8x8MotionInfo (pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
1696 pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk8Idx]] = pWelsMd->sMe.sMe8x8[i].sMvp;
1697 iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
1698 iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost;
1699
1700 //chroma
1701 pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
1702 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1703
1704 iBlk4X = (i & 1) << 2;
1705 iBlk4Y = (i >> 1) << 2;
1706 iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1707 iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1708
1709 pTmpRefCb = pRefCb + iRefBlk4Stride;
1710 pTmpDstCb = pDstCb + iDstBlk4Stride;
1711 pTmpRefCr = pRefCr + iRefBlk4Stride;
1712 pTmpDstCr = pDstCr + iDstBlk4Stride;
1713 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1714 4, 4); //Cb
1715 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1716 4, 4); //Cr
1717 break;
1718 case SUB_MB_TYPE_4x4:
1719 sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x4;
1720 //luma
1721 for (j = 0; j < 4; ++j) {
1722 iBlk4x4Idx = iBlk8Idx + j;
1723 InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
1724 PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMvp);
1725 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x4[i][j], &sMeRefine, 4, 4);
1726 UpdateP4x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMv);
1727 pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk4x4Idx]] = pWelsMd->sMe.sMe4x4[i][j].sMvp;
1728 iBestSadCost += pWelsMd->sMe.sMe4x4[i][j].uiSadCost;
1729 iBestSatdCost += pWelsMd->sMe.sMe4x4[i][j].uiSatdCost;
1730
1731 //chroma
1732 pMv = &pWelsMd->sMe.sMe4x4[i][j].sMv;
1733 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1734
1735 iBlk4X = (((i & 1) << 1) + (j & 1)) << 1;
1736 iBlk4Y = (((i >> 1) << 1) + (j >> 1)) << 1;
1737 iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1738 iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1739
1740 pTmpRefCb = pRefCb + iRefBlk4Stride;
1741 pTmpDstCb = pDstCb + iDstBlk4Stride;
1742 pTmpRefCr = pRefCr + iRefBlk4Stride;
1743 pTmpDstCr = pDstCr + iDstBlk4Stride;
1744 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1745 2, 2); //Cb
1746 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1747 2, 2); //Cr
1748 }
1749 break;
1750 case SUB_MB_TYPE_8x4:
1751 sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x4;
1752 //luma
1753 for (j = 0; j < 2; ++j) {
1754 iBlk4x4Idx = iBlk8Idx + (j << 1);
1755 InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j << 1]);
1756 PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMvp);
1757 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe8x4[i][j], &sMeRefine, 8, 4);
1758 UpdateP8x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMv);
1759 pMbCache->sMbMvp[g_kuiMbCountScan4Idx[ iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1760 //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[1 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1761 iBestSadCost += pWelsMd->sMe.sMe8x4[i][j].uiSadCost;
1762 iBestSatdCost += pWelsMd->sMe.sMe8x4[i][j].uiSatdCost;
1763
1764 //chroma
1765 pMv = &pWelsMd->sMe.sMe8x4[i][j].sMv;
1766 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1767
1768 iBlk4X = ((i & 1) << 1) << 1;
1769 iBlk4Y = (((i >> 1) << 1) + j) << 1;
1770 iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1771 iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1772
1773 pTmpRefCb = pRefCb + iRefBlk4Stride;
1774 pTmpDstCb = pDstCb + iDstBlk4Stride;
1775 pTmpRefCr = pRefCr + iRefBlk4Stride;
1776 pTmpDstCr = pDstCr + iDstBlk4Stride;
1777 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1778 4, 2); //Cb
1779 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1780 4, 2); //Cr
1781 }
1782 break;
1783 case SUB_MB_TYPE_4x8:
1784 sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x8;
1785 //luma
1786 for (j = 0; j < 2; ++j) {
1787 iBlk4x4Idx = iBlk8Idx + j;
1788 InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
1789 PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMvp);
1790 MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x8[i][j], &sMeRefine, 4, 8);
1791 UpdateP4x8MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMv);
1792 pMbCache->sMbMvp[g_kuiMbCountScan4Idx[ iBlk4x4Idx]] = pWelsMd->sMe.sMe4x8[i][j].sMvp;
1793 //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[4 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
1794 iBestSadCost += pWelsMd->sMe.sMe4x8[i][j].uiSadCost;
1795 iBestSatdCost += pWelsMd->sMe.sMe4x8[i][j].uiSatdCost;
1796
1797 //chroma
1798 pMv = &pWelsMd->sMe.sMe4x8[i][j].sMv;
1799 iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
1800
1801 iBlk4X = (((i & 1) << 1) + j) << 1;
1802 iBlk4Y = (((i >> 1) << 1)) << 1;
1803 iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
1804 iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
1805
1806 pTmpRefCb = pRefCb + iRefBlk4Stride;
1807 pTmpDstCb = pDstCb + iDstBlk4Stride;
1808 pTmpRefCr = pRefCr + iRefBlk4Stride;
1809 pTmpDstCr = pDstCr + iDstBlk4Stride;
1810 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
1811 2, 4); //Cb
1812 pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
1813 2, 4); //Cr
1814 }
1815 break;
1816 }
1817 }
1818 break;
1819 default:
1820 break;
1821 }
1822 pCurMb->pSadCost[0] = iBestSadCost;
1823 if (pWelsMd->bMdUsingSad)
1824 pWelsMd->iCostLuma = iBestSadCost;
1825 else
1826 pWelsMd->iCostLuma = iBestSatdCost;
1827
1828 }
WelsMdFirstIntraMode(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)1829 bool WelsMdFirstIntraMode (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
1830 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1831
1832 int32_t iCostI16x16 = WelsMdI16x16 (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
1833
1834 //compare cost_p16x16 with cost_i16x16
1835 if (iCostI16x16 < pWelsMd->iCostLuma) {
1836 pCurMb->uiMbType = MB_TYPE_INTRA16x16;
1837 pWelsMd->iCostLuma = iCostI16x16;
1838
1839 pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache);
1840
1841 //add pEnc&rec to MD--2010.3.15
1842 if (IS_INTRA16x16 (pCurMb->uiMbType)) {
1843 pCurMb->uiCbp = 0;
1844 WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
1845 }
1846
1847 //chroma
1848 pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
1849 WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache); //add pEnc&rec to MD--2010.3.15
1850 pCurMb->uiChromPredMode = pMbCache->uiChmaI8x8Mode;
1851 pCurMb->pSadCost[0] = 0;
1852 return true; //intra_mb_type is best
1853 }
1854
1855 return false;
1856 }
1857
WelsMdInterMb(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pUnused)1858 void WelsMdInterMb (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pUnused) {
1859 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1860 SMbCache* pMbCache = &pSlice->sMbCacheInfo;
1861 const uint32_t kuiNeighborAvail = pCurMb->uiNeighborAvail;
1862 const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
1863 const SMB* top_mb = pCurMb - kiMbWidth;
1864 const bool bMbLeftAvailPskip = ((kuiNeighborAvail & LEFT_MB_POS) ? IS_SKIP ((pCurMb - 1)->uiMbType) : false);
1865 const bool bMbTopAvailPskip = ((kuiNeighborAvail & TOP_MB_POS) ? IS_SKIP (top_mb->uiMbType) : false);
1866 const bool bMbTopLeftAvailPskip = ((kuiNeighborAvail & TOPLEFT_MB_POS) ? IS_SKIP ((top_mb - 1)->uiMbType) : false);
1867 const bool bMbTopRightAvailPskip = ((kuiNeighborAvail & TOPRIGHT_MB_POS) ? IS_SKIP ((top_mb + 1)->uiMbType) : false);
1868 bool bTrySkip = bMbLeftAvailPskip || bMbTopAvailPskip || bMbTopLeftAvailPskip || bMbTopRightAvailPskip;
1869 bool bKeepSkip = bMbLeftAvailPskip && bMbTopAvailPskip && bMbTopRightAvailPskip;
1870 bool bSkip = false;
1871
1872 //try BGD skip
1873 if (pEncCtx->pFuncList->pfInterMdBackgroundDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, &bKeepSkip)) {
1874 return;
1875 }
1876
1877 //try static or scrolled Pskip
1878 if (pEncCtx->pFuncList->pfSCDPSkipDecision (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache)) {
1879 return;
1880 }
1881
1882 //step 1: try SKIP
1883 bSkip = WelsMdInterJudgePskip (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bTrySkip);
1884
1885 if (bSkip) {
1886 if (bKeepSkip) {
1887 WelsMdInterDecidedPskip (pEncCtx, pSlice, pCurMb, pMbCache);
1888 return;
1889 }
1890 } else {
1891 PredictSad (pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pWelsMd->iSadPredMb);
1892
1893 //step 2: P_16x16
1894 pWelsMd->iCostLuma = WelsMdP16x16 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, pCurMb);
1895 pCurMb->uiMbType = MB_TYPE_16x16;
1896 }
1897
1898 WelsMdInterSecondaryModesEnc (pEncCtx, pWelsMd, pSlice, pCurMb, pMbCache, bSkip);
1899 }
1900
1901
1902
1903 //////
1904 // try the ordinary Pskip
1905 //////
WelsMdInterJudgePskip(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache,bool bTrySkip)1906 bool WelsMdInterJudgePskip (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache,
1907 bool bTrySkip) {
1908 bool bRet = true;
1909 if (((pEncCtx->pRefPic->iPictureType == P_SLICE) && (pMbCache->uiRefMbType == MB_TYPE_SKIP
1910 || pMbCache->uiRefMbType == MB_TYPE_BACKGROUND)) ||
1911 bTrySkip) {
1912 PredictSadSkip (pMbCache->sMvComponents.iRefIndexCache, pMbCache->bMbTypeSkip, pMbCache->iSadCostSkip, 0,
1913 & (pWelsMd->iSadPredSkip));
1914 bRet = WelsMdPSkipEnc (pEncCtx, pWelsMd, pCurMb, pMbCache) ? true : false;
1915 return bRet;
1916 }
1917
1918 return false;
1919 }
1920
1921 //////
1922 // try the ordinary Pskip
1923 //////
WelsMdInterUpdatePskip(SDqLayer * pCurDqLayer,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1924 void WelsMdInterUpdatePskip (SDqLayer* pCurDqLayer, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1925 //add pEnc&rec to MD--2010.3.15
1926 pCurMb->uiCbp = 0;
1927 pCurMb->uiLumaQp = pSlice->uiLastMbQp;
1928 pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51 (pCurMb->uiLumaQp +
1929 pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
1930 pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
1931 }
1932
1933
1934 //////
1935 // doublecheck if current MBTYPE is Pskip
1936 //////
WelsMdInterDoubleCheckPskip(SMB * pCurMb,SMbCache * pMbCache)1937 void WelsMdInterDoubleCheckPskip (SMB* pCurMb, SMbCache* pMbCache) {
1938 if (MB_TYPE_16x16 == pCurMb->uiMbType && 0 == pCurMb->uiCbp) {
1939 if (0 == pCurMb->pRefIndex[0]) {
1940 SMVUnitXY sMvp = { 0 };
1941
1942 PredSkipMv (pMbCache, &sMvp);
1943 if (LD32 (&sMvp) == LD32 (&pCurMb->sMv[0])) {
1944 pCurMb->uiMbType = MB_TYPE_SKIP;
1945 }
1946 }
1947 pMbCache->bCollocatedPredFlag = (LD32 (&pCurMb->sMv[0]) == 0);
1948 }
1949 }
1950
1951 //////
1952 // Pskip mb encode
1953 //////
WelsMdInterDecidedPskip(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1954 void WelsMdInterDecidedPskip (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1955 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1956 pCurMb->uiMbType = MB_TYPE_SKIP;
1957 WelsRecPskip (pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
1958 WelsMdInterUpdatePskip (pCurDqLayer, pSlice, pCurMb, pMbCache);
1959 }
1960
1961 //////
1962 // inter mb encode
1963 //////
WelsMdInterEncode(sWelsEncCtx * pEncCtx,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache)1964 void WelsMdInterEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, SMbCache* pMbCache) {
1965 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
1966 SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
1967
1968 //add pEnc&rec to MD--2010.3.15
1969 const int32_t kiCsStrideY = pCurDqLayer->iCsStride[0];
1970 const int32_t kiCsStrideUV = pCurDqLayer->iCsStride[1];
1971
1972 //add pEnc&rec to MD--2010.3.15
1973 pCurMb->uiCbp = 0;
1974 WelsInterMbEncode (pEncCtx, pSlice, pCurMb);
1975 WelsPMbChromaEncode (pEncCtx, pSlice, pCurMb);
1976
1977 pFunc->pfCopy16x16Aligned (pMbCache->SPicData.pCsMb[0], kiCsStrideY, pMbCache->pMemPredLuma, 16);
1978 pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[1], kiCsStrideUV, pMbCache->pMemPredChroma, 8);
1979 pFunc->pfCopy8x8Aligned (pMbCache->SPicData.pCsMb[2], kiCsStrideUV, pMbCache->pMemPredChroma + 64, 8);
1980 }
1981
1982
1983
1984 //
1985 //
1986 //
WelsMdInterSaveSadAndRefMbType(Mb_Type * pRefMbtypeList,SMbCache * pMbCache,const SMB * pCurMb,const SWelsMD * pMd)1987 void WelsMdInterSaveSadAndRefMbType (Mb_Type* pRefMbtypeList, SMbCache* pMbCache, const SMB* pCurMb,
1988 const SWelsMD* pMd) {
1989 const Mb_Type kmtCurMbtype = pCurMb->uiMbType;
1990
1991 //sad
1992 pMbCache->pEncSad[0] = (kmtCurMbtype == MB_TYPE_SKIP) ? pMd->iCostSkipMb : 0;
1993 //uiMbType
1994 pRefMbtypeList[pCurMb->iMbXY] = kmtCurMbtype;
1995 }
1996
WelsMdInterSecondaryModesEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SSlice * pSlice,SMB * pCurMb,SMbCache * pMbCache,const bool bSkip)1997 void WelsMdInterSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
1998 SMbCache* pMbCache, const bool bSkip) {
1999 //step 2: Intra
2000 const bool kbTrySkip = pEncCtx->pFuncList->pfFirstIntraMode (pEncCtx, pWelsMd, pCurMb, pMbCache);
2001 if (kbTrySkip)
2002 return;
2003
2004 if (bSkip) {
2005 WelsMdInterDecidedPskip (pEncCtx, pSlice, pCurMb, pMbCache);
2006 } else {
2007 //Step 3: SubP16 MD
2008 pEncCtx->pFuncList->pfSetScrollingMv (pEncCtx->pVaa, pWelsMd); //SCC
2009 pEncCtx->pFuncList->pfInterFineMd (pEncCtx, pWelsMd, pSlice, pCurMb, pWelsMd->iCostLuma);
2010
2011 //refinement for inter type
2012 WelsMdInterMbRefinement (pEncCtx, pWelsMd, pCurMb, pMbCache);
2013
2014 //step 7: invoke encoding
2015 WelsMdInterEncode (pEncCtx, pSlice, pCurMb, pMbCache);
2016
2017 //step 8: double check Pskip
2018 WelsMdInterDoubleCheckPskip (pCurMb, pMbCache);
2019 }
2020 }
2021
2022
WelsMdIntraSecondaryModesEnc(sWelsEncCtx * pEncCtx,SWelsMD * pWelsMd,SMB * pCurMb,SMbCache * pMbCache)2023 void WelsMdIntraSecondaryModesEnc (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
2024 SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
2025 //initial prediction memory for I_4x4
2026 pFunc->pfIntraFineMd (pEncCtx, pWelsMd, pCurMb, pMbCache); //WelsMdIntraFinePartitionVaa
2027
2028 //add pEnc&rec to MD--2010.3.15
2029 if (IS_INTRA16x16 (pCurMb->uiMbType)) {
2030 pCurMb->uiCbp = 0;
2031 WelsEncRecI16x16Y (pEncCtx, pCurMb, pMbCache);
2032 }
2033
2034 //chroma
2035 pWelsMd->iCostChroma = WelsMdIntraChroma (pFunc, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
2036 WelsIMbChromaEncode (pEncCtx, pCurMb, pMbCache); //add pEnc&rec to MD--2010.3.15
2037 pCurMb->uiChromPredMode = pMbCache->uiChmaI8x8Mode;
2038 pCurMb->pSadCost[0] = 0;
2039 }
2040
2041 } // namespace WelsEnc
2042