1 #include "deblocking_common.h"
2 #include "macros.h"
3
4 // C code only
DeblockLumaLt4_c(uint8_t * pPix,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)5 void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
6 int8_t* pTc) {
7 for (int32_t i = 0; i < 16; i++) {
8 int32_t iTc0 = pTc[i >> 2];
9 if (iTc0 >= 0) {
10 int32_t p0 = pPix[-iStrideX];
11 int32_t p1 = pPix[-2 * iStrideX];
12 int32_t p2 = pPix[-3 * iStrideX];
13 int32_t q0 = pPix[0];
14 int32_t q1 = pPix[iStrideX];
15 int32_t q2 = pPix[2 * iStrideX];
16 bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
17 bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
18 bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
19 int32_t iTc = iTc0;
20 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
21 bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
22 bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
23 if (bDetaP2P0) {
24 pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 * (1 << 1))) >> 1, -iTc0, iTc0);
25 iTc++;
26 }
27 if (bDetaQ2Q0) {
28 pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 * (1 << 1))) >> 1, -iTc0, iTc0);
29 iTc++;
30 }
31 int32_t iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc, iTc);
32 pPix[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
33 pPix[0] = WelsClip1 (q0 - iDeta); /* q0' */
34 }
35 }
36 pPix += iStrideY;
37 }
38 }
DeblockLumaEq4_c(uint8_t * pPix,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)39 void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
40 int32_t p0, p1, p2, q0, q1, q2;
41 int32_t iDetaP0Q0;
42 bool bDetaP1P0, bDetaQ1Q0;
43 for (int32_t i = 0; i < 16; i++) {
44 p0 = pPix[-iStrideX];
45 p1 = pPix[-2 * iStrideX];
46 p2 = pPix[-3 * iStrideX];
47 q0 = pPix[0];
48 q1 = pPix[iStrideX];
49 q2 = pPix[2 * iStrideX];
50 iDetaP0Q0 = WELS_ABS (p0 - q0);
51 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
52 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
53 if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
54 if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
55 bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
56 bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
57 if (bDetaP2P0) {
58 const int32_t p3 = pPix[-4 * iStrideX];
59 pPix[-iStrideX] = (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3; //p0
60 pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2; //p1
61 pPix[-3 * iStrideX] = ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3; //p2
62 } else {
63 pPix[-1 * iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2; //p0
64 }
65 if (bDetaQ2Q0) {
66 const int32_t q3 = pPix[3 * iStrideX];
67 pPix[0] = (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3; //q0
68 pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2; //q1
69 pPix[2 * iStrideX] = ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3; //q2
70 } else {
71 pPix[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2; //q0
72 }
73 } else {
74 pPix[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2; //p0
75 pPix[ 0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2; //q0
76 }
77 }
78 pPix += iStrideY;
79 }
80 }
DeblockLumaLt4V_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)81 void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
82 DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
83 }
DeblockLumaLt4H_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)84 void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
85 DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
86 }
DeblockLumaEq4V_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)87 void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
88 DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
89 }
DeblockLumaEq4H_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)90 void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
91 DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
92 }
DeblockChromaLt4_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)93 void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
94 int32_t iBeta, int8_t* pTc) {
95 int32_t p0, p1, q0, q1, iDeta;
96 bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
97
98 for (int32_t i = 0; i < 8; i++) {
99 int32_t iTc0 = pTc[i >> 1];
100 if (iTc0 > 0) {
101 p0 = pPixCb[-iStrideX];
102 p1 = pPixCb[-2 * iStrideX];
103 q0 = pPixCb[0];
104 q1 = pPixCb[iStrideX];
105
106 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
107 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
108 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
109 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
110 iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
111 pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
112 pPixCb[0] = WelsClip1 (q0 - iDeta); /* q0' */
113 }
114
115
116 p0 = pPixCr[-iStrideX];
117 p1 = pPixCr[-2 * iStrideX];
118 q0 = pPixCr[0];
119 q1 = pPixCr[iStrideX];
120
121 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
122 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
123 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
124
125 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
126 iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
127 pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
128 pPixCr[0] = WelsClip1 (q0 - iDeta); /* q0' */
129 }
130 }
131 pPixCb += iStrideY;
132 pPixCr += iStrideY;
133 }
134 }
DeblockChromaEq4_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)135 void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
136 int32_t iBeta) {
137 int32_t p0, p1, q0, q1;
138 bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
139 for (int32_t i = 0; i < 8; i++) {
140 //cb
141 p0 = pPixCb[-iStrideX];
142 p1 = pPixCb[-2 * iStrideX];
143 q0 = pPixCb[0];
144 q1 = pPixCb[iStrideX];
145 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
146 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
147 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
148 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
149 pPixCb[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2; /* p0' */
150 pPixCb[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2; /* q0' */
151 }
152
153 //cr
154 p0 = pPixCr[-iStrideX];
155 p1 = pPixCr[-2 * iStrideX];
156 q0 = pPixCr[0];
157 q1 = pPixCr[iStrideX];
158 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
159 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
160 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
161 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
162 pPixCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2; /* p0' */
163 pPixCr[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2; /* q0' */
164 }
165 pPixCr += iStrideY;
166 pPixCb += iStrideY;
167 }
168 }
DeblockChromaLt4V_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)169 void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
170 int8_t* tc) {
171 DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
172 }
DeblockChromaLt4H_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)173 void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
174 int8_t* tc) {
175 DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
176 }
DeblockChromaEq4V_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)177 void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
178 DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
179 }
DeblockChromaEq4H_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)180 void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
181 DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
182 }
183
DeblockChromaLt42_c(uint8_t * pPixCbCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)184 void DeblockChromaLt42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
185 int32_t iBeta, int8_t* pTc) {
186 int32_t p0, p1, q0, q1, iDeta;
187 bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
188
189 for (int32_t i = 0; i < 8; i++) {
190 int32_t iTc0 = pTc[i >> 1];
191 if (iTc0 > 0) {
192 p0 = pPixCbCr[-iStrideX];
193 p1 = pPixCbCr[-2 * iStrideX];
194 q0 = pPixCbCr[0];
195 q1 = pPixCbCr[iStrideX];
196
197 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
198 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
199 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
200 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
201 iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
202 pPixCbCr[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
203 pPixCbCr[0] = WelsClip1 (q0 - iDeta); /* q0' */
204 }
205
206
207 }
208 pPixCbCr += iStrideY;
209 }
210 }
DeblockChromaEq42_c(uint8_t * pPixCbCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)211 void DeblockChromaEq42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
212 int32_t iBeta) {
213 int32_t p0, p1, q0, q1;
214 bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
215 for (int32_t i = 0; i < 8; i++) {
216 p0 = pPixCbCr[-iStrideX];
217 p1 = pPixCbCr[-2 * iStrideX];
218 q0 = pPixCbCr[0];
219 q1 = pPixCbCr[iStrideX];
220 bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
221 bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
222 bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
223 if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
224 pPixCbCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2; /* p0' */
225 pPixCbCr[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2; /* q0' */
226 }
227
228 pPixCbCr += iStrideY;
229 }
230 }
231
DeblockChromaLt4V2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)232 void DeblockChromaLt4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
233 int8_t* tc) {
234 DeblockChromaLt42_c (pPixCbCr, iStride, 1, iAlpha, iBeta, tc);
235 }
DeblockChromaLt4H2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)236 void DeblockChromaLt4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
237 int8_t* tc) {
238
239 DeblockChromaLt42_c (pPixCbCr, 1, iStride, iAlpha, iBeta, tc);
240 }
DeblockChromaEq4V2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)241 void DeblockChromaEq4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
242 DeblockChromaEq42_c (pPixCbCr, iStride, 1, iAlpha, iBeta);
243 }
DeblockChromaEq4H2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)244 void DeblockChromaEq4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
245 DeblockChromaEq42_c (pPixCbCr, 1, iStride, iAlpha, iBeta);
246 }
247
/*
 * Collapses the first 24 entries of pNonZeroCount to flags, in place:
 * every non-zero entry becomes 1, every zero entry stays 0.
 */
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int8_t* pEntry = pNonZeroCount;
  int8_t* const pEnd = pNonZeroCount + 24;
  while (pEntry != pEnd) {
    *pEntry = (*pEntry != 0) ? 1 : 0;
    ++pEntry;
  }
}
254
255 #ifdef X86_ASM
256 extern "C" {
DeblockLumaLt4H_ssse3(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTc)257 void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
258 ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
259
260 DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
261 DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
262 DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
263 }
264
DeblockLumaEq4H_ssse3(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta)265 void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
266 ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
267
268 DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
269 DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
270 DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
271 }
272
273 }
274
275 #endif
276
277 #ifdef HAVE_MMI
278 extern "C" {
DeblockLumaLt4H_mmi(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTc)279 void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
280 ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
281
282 DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
283 DeblockLumaLt4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
284 DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
285 }
286
DeblockLumaEq4H_mmi(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta)287 void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
288 ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
289
290 DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
291 DeblockLumaEq4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta);
292 DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
293 }
294 }
295 #endif//HAVE_MMI
296