1 /*!
2 * \copy
3 * Copyright (c) 2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33 #include "decode_mb_aux.h"
34 #include "cpu_core.h"
35
36 namespace WelsEnc {
37 /****************************************************************************
38 * Dequant and Ihdm functions
39 ****************************************************************************/
/*!
 * \brief In-place 4x4 inverse Hadamard transform.
 *
 * pRes holds 16 coefficients in row-major order (4 per row). A butterfly is
 * applied to every row, then the same butterfly is applied to every column.
 * No scaling or rounding is performed; intermediate values are kept in
 * int16_t, so they wrap exactly like the stored coefficients.
 *
 * \param pRes  buffer of 16 int16_t values, overwritten with the result
 */
void WelsIHadamard4x4Dc (int16_t* pRes) {
  int16_t iButterfly[4];
  int32_t iRow, iCol;

  // horizontal pass: the rows are independent of each other
  for (iRow = 0; iRow < 4; iRow++) {
    int16_t* pRow = pRes + (iRow << 2);

    iButterfly[0] = pRow[0] + pRow[2];
    iButterfly[1] = pRow[0] - pRow[2];
    iButterfly[2] = pRow[1] - pRow[3];
    iButterfly[3] = pRow[1] + pRow[3];

    pRow[0] = iButterfly[0] + iButterfly[3];
    pRow[1] = iButterfly[1] + iButterfly[2];
    pRow[2] = iButterfly[1] - iButterfly[2];
    pRow[3] = iButterfly[0] - iButterfly[3];
  }

  // vertical pass: same butterfly, elements of a column are 4 entries apart
  for (iCol = 0; iCol < 4; iCol++) {
    int16_t* pCol = pRes + iCol;

    iButterfly[0] = pCol[0] + pCol[8];
    iButterfly[1] = pCol[0] - pCol[8];
    iButterfly[2] = pCol[4] - pCol[12];
    iButterfly[3] = pCol[4] + pCol[12];

    pCol[0]  = iButterfly[0] + iButterfly[3];
    pCol[4]  = iButterfly[1] + iButterfly[2];
    pCol[8]  = iButterfly[1] - iButterfly[2];
    pCol[12] = iButterfly[0] - iButterfly[3];
  }
}
78
79 /* for qp < 12 */
WelsDequantLumaDc4x4(int16_t * pRes,const int32_t kiQp)80 void WelsDequantLumaDc4x4 (int16_t* pRes, const int32_t kiQp) {
81 int32_t i = 15;
82 const uint16_t kuiDequantValue = g_kuiDequantCoeff[kiQp % 6][0];
83 const int16_t kiQF0 = kiQp / 6;
84 const int16_t kiQF1 = 2 - kiQF0;
85 const int16_t kiQF0S = 1 << (1 - kiQF0);
86
87 while (i >= 0) {
88 pRes[i ] = (pRes[i ] * kuiDequantValue + kiQF0S) >> kiQF1;
89 pRes[i - 1] = (pRes[i - 1] * kuiDequantValue + kiQF0S) >> kiQF1;
90 pRes[i - 2] = (pRes[i - 2] * kuiDequantValue + kiQF0S) >> kiQF1;
91 pRes[i - 3] = (pRes[i - 3] * kuiDequantValue + kiQF0S) >> kiQF1;
92
93 i -= 4;
94 }
95 }
96
/*!
 * \brief In-place 4x4 inverse Hadamard transform combined with scaling
 *        (the original comment marks this as the path for qp >= 12).
 *
 * A butterfly is applied to every row of the row-major 4x4 block, then to
 * every column; each output of the column pass is multiplied by kuiMF before
 * being stored back as int16_t.
 *
 * \param pRes   buffer of 16 int16_t coefficients, overwritten with the result
 * \param kuiMF  multiplier applied to every transformed coefficient
 */
void WelsDequantIHadamard4x4_c (int16_t* pRes, const uint16_t kuiMF) {
  int16_t iButterfly[4];
  int32_t iRow, iCol;

  // horizontal butterflies
  for (iRow = 0; iRow < 4; iRow++) {
    int16_t* pRow = pRes + iRow * 4;

    iButterfly[0] = pRow[0] + pRow[2];
    iButterfly[1] = pRow[0] - pRow[2];
    iButterfly[2] = pRow[1] - pRow[3];
    iButterfly[3] = pRow[1] + pRow[3];

    pRow[0] = iButterfly[0] + iButterfly[3];
    pRow[1] = iButterfly[1] + iButterfly[2];
    pRow[2] = iButterfly[1] - iButterfly[2];
    pRow[3] = iButterfly[0] - iButterfly[3];
  }

  // vertical butterflies, scaled on the way out
  for (iCol = 0; iCol < 4; iCol++) {
    int16_t* pCol = pRes + iCol;

    iButterfly[0] = pCol[0] + pCol[8];
    iButterfly[1] = pCol[0] - pCol[8];
    iButterfly[2] = pCol[4] - pCol[12];
    iButterfly[3] = pCol[4] + pCol[12];

    pCol[0]  = (iButterfly[0] + iButterfly[3]) * kuiMF;
    pCol[4]  = (iButterfly[1] + iButterfly[2]) * kuiMF;
    pCol[8]  = (iButterfly[1] - iButterfly[2]) * kuiMF;
    pCol[12] = (iButterfly[0] - iButterfly[3]) * kuiMF;
  }
}
126
/*!
 * \brief In-place 2x2 inverse Hadamard transform with scaling.
 *
 * The four values in pDct are combined by sums and differences; every result
 * is multiplied by kuiMF and shifted right by one bit before being stored.
 *
 * \param pDct   buffer of 4 int16_t coefficients, overwritten with the result
 * \param kuiMF  multiplier applied before the final >> 1
 */
void WelsDequantIHadamard2x2Dc (int16_t* pDct, const uint16_t kuiMF) {
  const int16_t kiC0 = pDct[0];
  const int16_t kiC1 = pDct[1];
  const int16_t kiC2 = pDct[2];
  const int16_t kiC3 = pDct[3];

  // first stage: pair (0,2) and pair (1,3); kept in int16_t as stored values
  const int16_t kiEvenSum  = kiC0 + kiC2;
  const int16_t kiEvenDiff = kiC0 - kiC2;
  const int16_t kiOddSum   = kiC1 + kiC3;
  const int16_t kiOddDiff  = kiC1 - kiC3;

  // second stage plus scaling
  pDct[0] = ((kiEvenSum  + kiOddSum)  * kuiMF) >> 1;
  pDct[1] = ((kiEvenSum  - kiOddSum)  * kuiMF) >> 1;
  pDct[2] = ((kiEvenDiff + kiOddDiff) * kuiMF) >> 1;
  pDct[3] = ((kiEvenDiff - kiOddDiff) * kuiMF) >> 1;
}
138
/*!
 * \brief In-place scaling of one 4x4 block of coefficients.
 *
 * Only the first 8 entries of kpMF are read: coefficient n is multiplied by
 * kpMF[n & 7], so the second half of the block reuses the same multipliers
 * as the first half.
 *
 * \param pRes  buffer of 16 int16_t coefficients, overwritten with the result
 * \param kpMF  table of at least 8 multipliers
 */
void WelsDequant4x4_c (int16_t* pRes, const uint16_t* kpMF) {
  int32_t iCoef;
  for (iCoef = 0; iCoef < 16; iCoef++) {
    pRes[iCoef] = (int16_t) (pRes[iCoef] * kpMF[iCoef & 7]);
  }
}
146
/*!
 * \brief In-place scaling of four consecutive 4x4 blocks (64 coefficients).
 *
 * Only the first 8 entries of kpMF are read: coefficient n is multiplied by
 * kpMF[n & 7], i.e. the same 8 multipliers repeat across the whole buffer.
 *
 * \param pRes  buffer of 64 int16_t coefficients, overwritten with the result
 * \param kpMF  table of at least 8 multipliers
 */
void WelsDequantFour4x4_c (int16_t* pRes, const uint16_t* kpMF) {
  int32_t iCoef;
  for (iCoef = 0; iCoef < 64; iCoef++) {
    pRes[iCoef] = (int16_t) (pRes[iCoef] * kpMF[iCoef & 7]);
  }
}
160
161 /****************************************************************************
162 * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
163 ****************************************************************************/
WelsIDctT4Rec_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct)164 void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
165 int32_t i;
166 int16_t iTemp[16];
167
168 int32_t iDstStridex2 = iStride << 1;
169 int32_t iDstStridex3 = iStride + iDstStridex2;
170 int32_t iPredStridex2 = iPredStride << 1;
171 int32_t iPredStridex3 = iPredStride + iPredStridex2;
172
173 for (i = 0; i < 4; i ++) { //horizon
174 int32_t iIdx = i << 2;
175 const int32_t kiHorSumU = pDct[iIdx] + pDct[iIdx + 2]; // add 0-2
176 const int32_t kiHorDelU = pDct[iIdx] - pDct[iIdx + 2]; // sub 0-2
177 const int32_t kiHorSumD = pDct[iIdx + 1] + (pDct[iIdx + 3] >> 1);
178 const int32_t kiHorDelD = (pDct[iIdx + 1] >> 1) - pDct[iIdx + 3];
179
180 iTemp[iIdx ] = kiHorSumU + kiHorSumD;
181 iTemp[iIdx + 1] = kiHorDelU + kiHorDelD;
182 iTemp[iIdx + 2] = kiHorDelU - kiHorDelD;
183 iTemp[iIdx + 3] = kiHorSumU - kiHorSumD;
184 }
185
186 for (i = 0; i < 4; i ++) { //vertical
187 const int32_t kiVerSumL = iTemp[i] + iTemp[8 + i];
188 const int32_t kiVerDelL = iTemp[i] - iTemp[8 + i];
189 const int32_t kiVerDelR = (iTemp[4 + i] >> 1) - iTemp[12 + i];
190 const int32_t kiVerSumR = iTemp[4 + i] + (iTemp[12 + i] >> 1);
191
192 pRec[i ] = WelsClip1 (pPred[i ] + ((kiVerSumL + kiVerSumR + 32) >> 6));
193 pRec[iStride + i ] = WelsClip1 (pPred[iPredStride + i ] + ((kiVerDelL + kiVerDelR + 32) >> 6));
194 pRec[iDstStridex2 + i] = WelsClip1 (pPred[iPredStridex2 + i] + ((kiVerDelL - kiVerDelR + 32) >> 6));
195 pRec[iDstStridex3 + i] = WelsClip1 (pPred[iPredStridex3 + i] + ((kiVerSumL - kiVerSumR + 32) >> 6));
196 }
197 }
198
WelsIDctFourT4Rec_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct)199 void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
200 int32_t iDstStridex4 = iStride << 2;
201 int32_t iPredStridex4 = iPredStride << 2;
202 WelsIDctT4Rec_c (pRec, iStride, pPred, iPredStride, pDct);
203 WelsIDctT4Rec_c (&pRec[4], iStride, &pPred[4], iPredStride, pDct + 16);
204 WelsIDctT4Rec_c (&pRec[iDstStridex4 ], iStride, &pPred[iPredStridex4 ], iPredStride, pDct + 32);
205 WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4], iPredStride, pDct + 48);
206
207 }
208
WelsIDctT4RecOnMb(uint8_t * pDst,int32_t iDstStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct,PIDctFunc pfIDctFourT4)209 void WelsIDctT4RecOnMb (uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct,
210 PIDctFunc pfIDctFourT4) {
211 int32_t iDstStridex8 = iDstStride << 3;
212 int32_t iPredStridex8 = iPredStride << 3;
213
214 pfIDctFourT4 (&pDst[0], iDstStride, &pPred[0], iPredStride, pDct);
215 pfIDctFourT4 (&pDst[8], iDstStride, &pPred[8], iPredStride, pDct + 64);
216 pfIDctFourT4 (&pDst[iDstStridex8], iDstStride, &pPred[iPredStridex8], iPredStride, pDct + 128);
217 pfIDctFourT4 (&pDst[iDstStridex8 + 8], iDstStride, &pPred[iPredStridex8 + 8], iPredStride, pDct + 192);
218 }
219
220 /*
221 * pfIDctI16x16Dc: do luma idct of an MB for I16x16 mode, when only dc value are non-zero
222 */
WelsIDctRecI16x16Dc_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDctDc)223 void WelsIDctRecI16x16Dc_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDctDc) {
224 int32_t i, j;
225
226 for (i = 0; i < 16; i ++) {
227 for (j = 0; j < 16; j++) {
228 pRec[j] = WelsClip1 (pPred[j] + ((pDctDc[ (i & 0x0C) + (j >> 2)] + 32) >> 6));
229 }
230 pRec += iStride;
231 pPred += iPredStride;
232 }
233 }
234
/*!
 * \brief Fill a 24-entry table of block offsets derived from the strides.
 *
 * Entries 0..15 are written in four groups of four. For group g, with
 * column base k = (g & 1) * 2 and row base r = g & 2, the entries are
 * (x + y * kiStrideY) << 2 for (x, y) = (k, r), (k + 1, r), (k, r + 1),
 * (k + 1, r + 1).
 *
 * Entries 16..19 and 20..23 hold identical values:
 * ((g & 1) + (g & 2) * kiStrideUV) << 2.
 * NOTE(review): the row multiplier here is 0 or 2, so entries 18, 19, 22 and
 * 23 are offset by 8 * kiStrideUV - confirm that this is what callers expect.
 *
 * \param pBlock      output table with room for at least 24 int32_t entries
 * \param kiStrideY   stride used for entries 0..15
 * \param kiStrideUV  stride used for entries 16..23
 */
void WelsGetEncBlockStrideOffset (int32_t* pBlock, const int32_t kiStrideY, const int32_t kiStrideUV) {
  int32_t iGroup;

  for (iGroup = 0; iGroup < 4; iGroup++) {
    int32_t* pGroup = pBlock + (iGroup << 2);
    const int32_t kiColBase = (iGroup & 0x01) << 1;   // 0 or 2
    const int32_t kiRowBase = iGroup & 0x02;          // 0 or 2
    const int32_t kiUpper   = kiRowBase * kiStrideY;
    const int32_t kiLower   = (kiRowBase + 1) * kiStrideY;
    const int32_t kiChroma  = ((iGroup & 0x01) + kiRowBase * kiStrideUV) << 2;

    pGroup[0] = (kiColBase + kiUpper) << 2;
    pGroup[1] = (kiColBase + 1 + kiUpper) << 2;
    pGroup[2] = (kiColBase + kiLower) << 2;
    pGroup[3] = (kiColBase + 1 + kiLower) << 2;

    pBlock[20 + iGroup] = kiChroma;
    pBlock[16 + iGroup] = kiChroma;
  }
}
250
WelsInitReconstructionFuncs(SWelsFuncPtrList * pFuncList,uint32_t uiCpuFlag)251 void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
252 pFuncList->pfDequantization4x4 = WelsDequant4x4_c;
253 pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_c;
254 pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_c;
255
256 pFuncList->pfIDctT4 = WelsIDctT4Rec_c;
257 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_c;
258 pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_c;
259
260 #if defined(X86_ASM)
261 if (uiCpuFlag & WELS_CPU_MMXEXT) {
262 pFuncList->pfIDctT4 = WelsIDctT4Rec_mmx;
263 }
264 if (uiCpuFlag & WELS_CPU_SSE2) {
265 pFuncList->pfDequantization4x4 = WelsDequant4x4_sse2;
266 pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_sse2;
267 pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_sse2;
268
269 pFuncList->pfIDctT4 = WelsIDctT4Rec_sse2;
270 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2;
271 pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
272 }
273 #if defined(HAVE_AVX2)
274 if (uiCpuFlag & WELS_CPU_AVX2) {
275 pFuncList->pfIDctT4 = WelsIDctT4Rec_avx2;
276 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
277 }
278 #endif
279
280 #endif//X86_ASM
281
282 #if defined(HAVE_NEON)
283 if (uiCpuFlag & WELS_CPU_NEON) {
284 pFuncList->pfDequantization4x4 = WelsDequant4x4_neon;
285 pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_neon;
286 pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_neon;
287
288 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_neon;
289 pFuncList->pfIDctT4 = WelsIDctT4Rec_neon;
290 pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;
291 }
292 #endif
293
294 #if defined(HAVE_NEON_AARCH64)
295 if (uiCpuFlag & WELS_CPU_NEON) {
296 pFuncList->pfDequantization4x4 = WelsDequant4x4_AArch64_neon;
297 pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_AArch64_neon;
298 pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_AArch64_neon;
299
300 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_AArch64_neon;
301 pFuncList->pfIDctT4 = WelsIDctT4Rec_AArch64_neon;
302 pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;
303 }
304 #endif
305
306 #if defined(HAVE_MMI)
307 if (uiCpuFlag & WELS_CPU_MMI) {
308 pFuncList->pfIDctT4 = WelsIDctT4Rec_mmi;
309 pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_mmi;
310 pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_mmi;
311 }
312 #endif//HAVE_MMI
313 }
314 }
315