1 /*!
2 * \copy
3 * Copyright (c) 2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33
34 #include "ls_defines.h"
35 #include "encode_mb_aux.h"
36 #include "cpu_core.h"
37 namespace WelsEnc {
38
/*
 * g_kiQuantInterFF: 58 rows of 8 int16_t values, declared through
 * ALIGNED_DECLARE with an alignment argument of 16.
 *
 * Every row has the shape { a, b, a, b, b, c, b, c }. The 8-entry period
 * matches the (index & 0x07) table lookup done by the quantisation routines
 * in this file -- presumably a row of this table is what callers pass as pFF;
 * the call sites are not visible here, so confirm before relying on it.
 *
 * Rows 0..51 are labelled with their row index. The last six rows (array
 * indices 52..57) are marked "only for intra" and their labels restart at 46.
 */
ALIGNED_DECLARE (const int16_t, g_kiQuantInterFF[58][8], 16) = {
  /* 0*/ { 0, 1, 0, 1, 1, 1, 1, 1 },
  /* 1*/ { 0, 1, 0, 1, 1, 1, 1, 1 },
  /* 2*/ { 1, 1, 1, 1, 1, 1, 1, 1 },
  /* 3*/ { 1, 1, 1, 1, 1, 1, 1, 1 },
  /* 4*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
  /* 5*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
  /* 6*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
  /* 7*/ { 1, 1, 1, 1, 1, 2, 1, 2 },
  /* 8*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
  /* 9*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
  /*10*/ { 1, 2, 1, 2, 2, 3, 2, 3 },
  /*11*/ { 1, 2, 1, 2, 2, 4, 2, 4 },
  /*12*/ { 2, 3, 2, 3, 3, 4, 3, 4 },
  /*13*/ { 2, 3, 2, 3, 3, 5, 3, 5 },
  /*14*/ { 2, 3, 2, 3, 3, 5, 3, 5 },
  /*15*/ { 2, 4, 2, 4, 4, 6, 4, 6 },
  /*16*/ { 3, 4, 3, 4, 4, 7, 4, 7 },
  /*17*/ { 3, 5, 3, 5, 5, 8, 5, 8 },
  /*18*/ { 3, 5, 3, 5, 5, 8, 5, 8 },
  /*19*/ { 4, 6, 4, 6, 6, 9, 6, 9 },
  /*20*/ { 4, 7, 4, 7, 7, 10, 7, 10 },
  /*21*/ { 5, 8, 5, 8, 8, 12, 8, 12 },
  /*22*/ { 5, 8, 5, 8, 8, 13, 8, 13 },
  /*23*/ { 6, 10, 6, 10, 10, 15, 10, 15 },
  /*24*/ { 7, 11, 7, 11, 11, 17, 11, 17 },
  /*25*/ { 7, 12, 7, 12, 12, 19, 12, 19 },
  /*26*/ { 9, 13, 9, 13, 13, 21, 13, 21 },
  /*27*/ { 9, 15, 9, 15, 15, 24, 15, 24 },
  /*28*/ { 11, 17, 11, 17, 17, 26, 17, 26 },
  /*29*/ { 12, 19, 12, 19, 19, 30, 19, 30 },
  /*30*/ { 13, 22, 13, 22, 22, 33, 22, 33 },
  /*31*/ { 15, 23, 15, 23, 23, 38, 23, 38 },
  /*32*/ { 17, 27, 17, 27, 27, 42, 27, 42 },
  /*33*/ { 19, 30, 19, 30, 30, 48, 30, 48 },
  /*34*/ { 21, 33, 21, 33, 33, 52, 33, 52 },
  /*35*/ { 24, 38, 24, 38, 38, 60, 38, 60 },
  /*36*/ { 27, 43, 27, 43, 43, 67, 43, 67 },
  /*37*/ { 29, 47, 29, 47, 47, 75, 47, 75 },
  /*38*/ { 35, 53, 35, 53, 53, 83, 53, 83 },
  /*39*/ { 37, 60, 37, 60, 60, 96, 60, 96 },
  /*40*/ { 43, 67, 43, 67, 67, 104, 67, 104 },
  /*41*/ { 48, 77, 48, 77, 77, 121, 77, 121 },
  /*42*/ { 53, 87, 53, 87, 87, 133, 87, 133 },
  /*43*/ { 59, 93, 59, 93, 93, 150, 93, 150 },
  /*44*/ { 69, 107, 69, 107, 107, 167, 107, 167 },
  /*45*/ { 75, 120, 75, 120, 120, 192, 120, 192 },
  /*46*/ { 85, 133, 85, 133, 133, 208, 133, 208 },
  /*47*/ { 96, 153, 96, 153, 153, 242, 153, 242 },
  /*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
  /*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
  /*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
  /*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
  /* from here below is only for intra (array rows 52..57; labels restart at 46) */
  /*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
  /*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
  /*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
  /*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
  /*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
  /*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },
};
100
101
102
/*
 * g_kiQuantMF: 52 rows of 8 int16_t values, declared through ALIGNED_DECLARE
 * with an alignment argument of 16.
 *
 * Every row has the shape { a, b, a, b, b, c, b, c }, the same 8-entry layout
 * that the quantisation routines in this file index with (index & 0x07).
 * Within a column the value roughly halves every six rows (column 0:
 * 26214, 13107, 6554, 3277, ...). Presumably a row of this table is what
 * callers pass as pMF; the call sites are not visible here, so confirm.
 */
ALIGNED_DECLARE (const int16_t, g_kiQuantMF[52][8], 16) = {
  /* 0*/ {26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 },
  /* 1*/ {23832, 14980, 23832, 14980, 14980, 9320, 14980, 9320 },
  /* 2*/ {20164, 13108, 20164, 13108, 13108, 8388, 13108, 8388 },
  /* 3*/ {18724, 11650, 18724, 11650, 11650, 7294, 11650, 7294 },
  /* 4*/ {16384, 10486, 16384, 10486, 10486, 6710, 10486, 6710 },
  /* 5*/ {14564, 9118, 14564, 9118, 9118, 5786, 9118, 5786 },
  /* 6*/ {13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243 },
  /* 7*/ {11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660 },
  /* 8*/ {10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194 },
  /* 9*/ { 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647 },
  /*10*/ { 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355 },
  /*11*/ { 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893 },
  /*12*/ { 6554, 4033, 6554, 4033, 4033, 2622, 4033, 2622 },
  /*13*/ { 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330 },
  /*14*/ { 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097 },
  /*15*/ { 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824 },
  /*16*/ { 4096, 2622, 4096, 2622, 2622, 1678, 2622, 1678 },
  /*17*/ { 3641, 2280, 3641, 2280, 2280, 1447, 2280, 1447 },
  /*18*/ { 3277, 2017, 3277, 2017, 2017, 1311, 2017, 1311 },
  /*19*/ { 2979, 1873, 2979, 1873, 1873, 1165, 1873, 1165 },
  /*20*/ { 2521, 1639, 2521, 1639, 1639, 1049, 1639, 1049 },
  /*21*/ { 2341, 1456, 2341, 1456, 1456, 912, 1456, 912 },
  /*22*/ { 2048, 1311, 2048, 1311, 1311, 839, 1311, 839 },
  /*23*/ { 1821, 1140, 1821, 1140, 1140, 723, 1140, 723 },
  /*24*/ { 1638, 1008, 1638, 1008, 1008, 655, 1008, 655 },
  /*25*/ { 1490, 936, 1490, 936, 936, 583, 936, 583 },
  /*26*/ { 1260, 819, 1260, 819, 819, 524, 819, 524 },
  /*27*/ { 1170, 728, 1170, 728, 728, 456, 728, 456 },
  /*28*/ { 1024, 655, 1024, 655, 655, 419, 655, 419 },
  /*29*/ { 910, 570, 910, 570, 570, 362, 570, 362 },
  /*30*/ { 819, 504, 819, 504, 504, 328, 504, 328 },
  /*31*/ { 745, 468, 745, 468, 468, 291, 468, 291 },
  /*32*/ { 630, 410, 630, 410, 410, 262, 410, 262 },
  /*33*/ { 585, 364, 585, 364, 364, 228, 364, 228 },
  /*34*/ { 512, 328, 512, 328, 328, 210, 328, 210 },
  /*35*/ { 455, 285, 455, 285, 285, 181, 285, 181 },
  /*36*/ { 410, 252, 410, 252, 252, 164, 252, 164 },
  /*37*/ { 372, 234, 372, 234, 234, 146, 234, 146 },
  /*38*/ { 315, 205, 315, 205, 205, 131, 205, 131 },
  /*39*/ { 293, 182, 293, 182, 182, 114, 182, 114 },
  /*40*/ { 256, 164, 256, 164, 164, 105, 164, 105 },
  /*41*/ { 228, 142, 228, 142, 142, 90, 142, 90 },
  /*42*/ { 205, 126, 205, 126, 126, 82, 126, 82 },
  /*43*/ { 186, 117, 186, 117, 117, 73, 117, 73 },
  /*44*/ { 158, 102, 158, 102, 102, 66, 102, 66 },
  /*45*/ { 146, 91, 146, 91, 91, 57, 91, 57 },
  /*46*/ { 128, 82, 128, 82, 82, 52, 82, 52 },
  /*47*/ { 114, 71, 114, 71, 71, 45, 71, 45 },
  /*48*/ { 102, 63, 102, 63, 63, 41, 63, 41 },
  /*49*/ { 93, 59, 93, 59, 59, 36, 59, 36 },
  /*50*/ { 79, 51, 79, 51, 51, 33, 51, 33 },
  /*51*/ { 73, 46, 73, 46, 46, 28, 46, 28 }
};
157
158 /****************************************************************************
159 * HDM and Quant functions
160 ****************************************************************************/
/*
 * Quantisation helper macros.
 *
 * All three read a variable named iSign that must be in scope where the macro
 * is used. iSign is expected to be 0 for a non-negative coefficient and -1
 * for a negative one.
 *
 * WELS_ABS_LC(a)      (iSign ^ a) - iSign: leaves a unchanged when iSign is 0
 *                     and negates it when iSign is -1. Applied to a signed
 *                     coefficient it gives the magnitude; applied to a
 *                     magnitude it restores the sign.
 * NEW_QUANT(d,ff,mf)  ((ff + |d|) * mf) >> 16, i.e. the quantised magnitude.
 *                     The whole expansion is parenthesised so it composes
 *                     safely inside larger expressions.
 * WELS_NEW_QUANT(...) NEW_QUANT with the sign of the input put back.
 */
#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
#define NEW_QUANT(pDct, iFF, iMF) ((((iFF) + WELS_ABS_LC(pDct)) * (iMF)) >> 16)
#define WELS_NEW_QUANT(pDct, iFF, iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))
WelsQuant4x4_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF)164 void WelsQuant4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
165 int32_t i, j, iSign;
166 for (i = 0; i < 16; i += 4) {
167 j = i & 0x07;
168 iSign = WELS_SIGN (pDct[i]);
169 pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
170 iSign = WELS_SIGN (pDct[i + 1]);
171 pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
172 iSign = WELS_SIGN (pDct[i + 2]);
173 pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
174 iSign = WELS_SIGN (pDct[i + 3]);
175 pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
176 }
177 }
178
WelsQuant4x4Dc_c(int16_t * pDct,int16_t iFF,int16_t iMF)179 void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF, int16_t iMF) {
180 int32_t i, iSign;
181 for (i = 0; i < 16; i += 4) {
182 iSign = WELS_SIGN (pDct[i]);
183 pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF);
184 iSign = WELS_SIGN (pDct[i + 1]);
185 pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], iFF, iMF);
186 iSign = WELS_SIGN (pDct[i + 2]);
187 pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], iFF, iMF);
188 iSign = WELS_SIGN (pDct[i + 3]);
189 pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], iFF, iMF);
190 }
191 }
192
WelsQuantFour4x4_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF)193 void WelsQuantFour4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
194 int32_t i, j, iSign;
195
196 for (i = 0; i < 64; i += 4) {
197 j = i & 0x07;
198 iSign = WELS_SIGN (pDct[i]);
199 pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
200 iSign = WELS_SIGN (pDct[i + 1]);
201 pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
202 iSign = WELS_SIGN (pDct[i + 2]);
203 pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
204 iSign = WELS_SIGN (pDct[i + 3]);
205 pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
206 }
207 }
208
WelsQuantFour4x4Max_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF,int16_t * pMax)209 void WelsQuantFour4x4Max_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
210 int32_t i, j, k, iSign;
211 int16_t iMaxAbs;
212 for (k = 0; k < 4; k++) {
213 iMaxAbs = 0;
214 for (i = 0; i < 16; i++) {
215 j = i & 0x07;
216 iSign = WELS_SIGN (pDct[i]);
217 pDct[i] = NEW_QUANT (pDct[i], pFF[j], pMF[j]);
218 if (iMaxAbs < pDct[i]) iMaxAbs = pDct[i];
219 pDct[i] = WELS_ABS_LC (pDct[i]);
220 }
221 pDct += 16;
222 pMax[k] = iMaxAbs;
223 }
224 }
225
WelsHadamardQuant2x2Skip_c(int16_t * pRs,int16_t iFF,int16_t iMF)226 int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRs, int16_t iFF, int16_t iMF) {
227 int16_t pDct[4], s[4];
228 int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
229
230 s[0] = pRs[0] + pRs[32];
231 s[1] = pRs[0] - pRs[32];
232 s[2] = pRs[16] + pRs[48];
233 s[3] = pRs[16] - pRs[48];
234
235 pDct[0] = s[0] + s[2];
236 pDct[1] = s[0] - s[2];
237 pDct[2] = s[1] + s[3];
238 pDct[3] = s[1] - s[3];
239
240 return ((WELS_ABS (pDct[0]) > iThreshold) || (WELS_ABS (pDct[1]) > iThreshold) || (WELS_ABS (pDct[2]) > iThreshold)
241 || (WELS_ABS (pDct[3]) > iThreshold));
242 }
243
WelsHadamardQuant2x2_c(int16_t * pRs,const int16_t iFF,int16_t iMF,int16_t * pDct,int16_t * pBlock)244 int32_t WelsHadamardQuant2x2_c (int16_t* pRs, const int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock) {
245 int16_t s[4];
246 int32_t iSign, i, iDcNzc = 0;
247
248 s[0] = pRs[0] + pRs[32];
249 s[1] = pRs[0] - pRs[32];
250 s[2] = pRs[16] + pRs[48];
251 s[3] = pRs[16] - pRs[48];
252
253 pRs[0] = 0;
254 pRs[16] = 0;
255 pRs[32] = 0;
256 pRs[48] = 0;
257
258 pDct[0] = s[0] + s[2];
259 pDct[1] = s[0] - s[2];
260 pDct[2] = s[1] + s[3];
261 pDct[3] = s[1] - s[3];
262
263 iSign = WELS_SIGN (pDct[0]);
264 pDct[0] = WELS_NEW_QUANT (pDct[0], iFF, iMF);
265 iSign = WELS_SIGN (pDct[1]);
266 pDct[1] = WELS_NEW_QUANT (pDct[1], iFF, iMF);
267 iSign = WELS_SIGN (pDct[2]);
268 pDct[2] = WELS_NEW_QUANT (pDct[2], iFF, iMF);
269 iSign = WELS_SIGN (pDct[3]);
270 pDct[3] = WELS_NEW_QUANT (pDct[3], iFF, iMF);
271
272 ST64 (pBlock, LD64 (pDct));
273
274 for (i = 0; i < 4; i++)
275 iDcNzc += (pBlock[i] != 0);
276 return iDcNzc;
277 }
278
279 /* dc value pick up and hdm_4x4 */
WelsHadamardT4Dc_c(int16_t * pLumaDc,int16_t * pDct)280 void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct) {
281 int32_t p[16], s[4];
282 int32_t i, iIdx;
283
284 for (i = 0 ; i < 16 ; i += 4) {
285 iIdx = ((i & 0x08) << 4) + ((i & 0x04) << 3);
286 s[0] = pDct[iIdx ] + pDct[iIdx + 80];
287 s[3] = pDct[iIdx ] - pDct[iIdx + 80];
288 s[1] = pDct[iIdx + 16] + pDct[iIdx + 64];
289 s[2] = pDct[iIdx + 16] - pDct[iIdx + 64];
290
291 p[i ] = s[0] + s[1];
292 p[i + 2] = s[0] - s[1];
293 p[i + 1] = s[3] + s[2];
294 p[i + 3] = s[3] - s[2];
295 }
296
297 for (i = 0 ; i < 4 ; i ++) {
298 s[0] = p[i ] + p[i + 12];
299 s[3] = p[i ] - p[i + 12];
300 s[1] = p[i + 4] + p[i + 8];
301 s[2] = p[i + 4] - p[i + 8];
302
303 pLumaDc[i ] = WELS_CLIP3 ((s[0] + s[1] + 1) >> 1, -32768, 32767);
304 pLumaDc[i + 8 ] = WELS_CLIP3 ((s[0] - s[1] + 1) >> 1, -32768, 32767);
305 pLumaDc[i + 4 ] = WELS_CLIP3 ((s[3] + s[2] + 1) >> 1, -32768, 32767);
306 pLumaDc[i + 12] = WELS_CLIP3 ((s[3] - s[2] + 1) >> 1, -32768, 32767);
307 }
308 }
309
310 /****************************************************************************
311 * DCT functions
312 ****************************************************************************/
/*
 * 4x4 integer transform of the difference between two pixel blocks.
 *
 * pDct      receives 16 coefficients, stored row by row
 * pPixel1   top-left of the first 4x4 pixel block
 * iStride1  distance in bytes between rows of pPixel1
 * pPixel2   top-left of the second 4x4 pixel block (subtracted from the first)
 * iStride2  distance in bytes between rows of pPixel2
 *
 * Multiplications by 2 are written as products rather than shifts because
 * the operands can be negative.
 */
void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  int16_t iRow, iCol;

  /* pass 1: form each residual row and transform it horizontally */
  for (iRow = 0; iRow < 4; ++iRow) {
    int16_t* pOut = pDct + 4 * iRow;
    const int16_t iD0 = pPixel1[0] - pPixel2[0];
    const int16_t iD1 = pPixel1[1] - pPixel2[1];
    const int16_t iD2 = pPixel1[2] - pPixel2[2];
    const int16_t iD3 = pPixel1[3] - pPixel2[3];
    const int16_t iSum03 = iD0 + iD3;
    const int16_t iDif03 = iD0 - iD3;
    const int16_t iSum12 = iD1 + iD2;
    const int16_t iDif12 = iD1 - iD2;

    pOut[0] = iSum03 + iSum12;
    pOut[1] = (iDif03 * 2) + iDif12;
    pOut[2] = iSum03 - iSum12;
    pOut[3] = iDif03 - (iDif12 * 2);

    pPixel1 += iStride1;
    pPixel2 += iStride2;
  }

  /* pass 2: same butterfly applied down each column, in place */
  for (iCol = 0; iCol < 4; ++iCol) {
    int16_t* pColumn = pDct + iCol;
    const int16_t iSum03 = pColumn[0] + pColumn[12];
    const int16_t iDif03 = pColumn[0] - pColumn[12];
    const int16_t iSum12 = pColumn[4] + pColumn[8];
    const int16_t iDif12 = pColumn[4] - pColumn[8];

    pColumn[0]  = iSum03 + iSum12;
    pColumn[4]  = (iDif03 * 2) + iDif12;
    pColumn[8]  = iSum03 - iSum12;
    pColumn[12] = iDif03 - (iDif12 * 2);
  }
}
357
WelsDctFourT4_c(int16_t * pDct,uint8_t * pPixel1,int32_t iStride1,uint8_t * pPixel2,int32_t iStride2)358 void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
359 int32_t stride_1 = iStride1 << 2;
360 int32_t stride_2 = iStride2 << 2;
361
362 WelsDctT4_c (pDct, &pPixel1[0], iStride1, &pPixel2[0], iStride2);
363 WelsDctT4_c (pDct + 16, &pPixel1[4], iStride1, &pPixel2[4], iStride2);
364 WelsDctT4_c (pDct + 32, &pPixel1[stride_1 ], iStride1, &pPixel2[stride_2 ], iStride2);
365 WelsDctT4_c (pDct + 48, &pPixel1[stride_1 + 4], iStride1, &pPixel2[stride_2 + 4], iStride2);
366 }
367
368 /****************************************************************************
369 * Scan and Score functions
370 ****************************************************************************/
WelsScan4x4DcAc_c(int16_t * pLevel,int16_t * pDct)371 void WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct) {
372 ST32 (pLevel, LD32 (pDct));
373 pLevel[2] = pDct[4];
374 pLevel[3] = pDct[8];
375 pLevel[4] = pDct[5];
376 ST32 (pLevel + 5, LD32 (pDct + 2));
377 pLevel[7] = pDct[6];
378 pLevel[8] = pDct[9];
379 ST32 (pLevel + 9, LD32 (pDct + 12));
380 pLevel[11] = pDct[10];
381 pLevel[12] = pDct[7];
382 pLevel[13] = pDct[11];
383 ST32 (pLevel + 14, LD32 (pDct + 14));
384 }
385
WelsScan4x4Ac_c(int16_t * pLevel,int16_t * pDct)386 void WelsScan4x4Ac_c (int16_t* pLevel, int16_t* pDct) {
387 pLevel[0] = pDct[1];
388 pLevel[1] = pDct[4];
389 pLevel[2] = pDct[8];
390 pLevel[3] = pDct[5];
391 ST32 (&pLevel[4], LD32 (&pDct[2]));
392 pLevel[6] = pDct[6];
393 pLevel[7] = pDct[9];
394 ST32 (&pLevel[8], LD32 (&pDct[12]));
395 pLevel[10] = pDct[10];
396 pLevel[11] = pDct[7];
397 pLevel[12] = pDct[11];
398 ST32 (&pLevel[13], LD32 (&pDct[14]));
399 pLevel[15] = 0;
400 }
401
WelsScan4x4Dc(int16_t * pLevel,int16_t * pDct)402 void WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct) {
403 ST32 (pLevel, LD32 (pDct));
404 pLevel[2] = pDct[4];
405 pLevel[3] = pDct[8];
406 pLevel[4] = pDct[5];
407 ST32 (pLevel + 5, LD32 (pDct + 2));
408 pLevel[7] = pDct[6];
409 pLevel[8] = pDct[9];
410 ST32 (pLevel + 9, LD32 (pDct + 12));
411 pLevel[11] = pDct[10];
412 pLevel[12] = pDct[7];
413 pLevel[13] = pDct[11];
414 ST32 (pLevel + 14, LD32 (pDct + 14));
415 }
416
/*
 * Scores a block of 16 coefficients (refer to JVT-O079).
 *
 * The block is walked from pDct[15] down to pDct[0]. For every non-zero
 * coefficient, the zeros between it and the next non-zero coefficient below
 * it (or the start of the array) are counted, and the table entry for that
 * count is added to the score. Zeros above the highest non-zero coefficient
 * add nothing.
 *
 * Returns the accumulated score; 0 for an all-zero block.
 */
int32_t WelsCalculateSingleCtr4x4_c (int16_t* pDct) {
  /* cost by number of zeros below a non-zero coefficient */
  static const int32_t kiRunCost[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  int32_t iScore = 0;
  int32_t iPos;

  /* locate the highest non-zero coefficient */
  for (iPos = 15; iPos >= 0; --iPos) {
    if (pDct[iPos] != 0)
      break;
  }

  /* iPos sits on a non-zero coefficient at the top of every iteration */
  while (iPos >= 0) {
    int32_t iZeros = 0;

    --iPos;
    while (iPos >= 0 && pDct[iPos] == 0) {
      ++iZeros;
      --iPos;
    }
    iScore += kiRunCost[iZeros];
  }
  return iScore;
}
436
/*
 * Counts the non-zero entries among pLevel[0..15].
 *
 * Returns a value in the range 0..16.
 */
int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
  int32_t iNonZero = 0;
  int32_t k;

  for (k = 0; k < 16; ++k) {
    if (pLevel[k] != 0)
      ++iNonZero;
  }
  return iNonZero;
}
451
452 #ifdef HAVE_NEON
WelsHadamardQuant2x2Skip_neon(int16_t * pRes,int16_t iFF,int16_t iMF)453 int32_t WelsHadamardQuant2x2Skip_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
454 int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
455 return WelsHadamardQuant2x2SkipKernel_neon (pRes, iThreshold);
456 }
457 #endif
458 #ifdef HAVE_NEON_AARCH64
WelsHadamardQuant2x2Skip_AArch64_neon(int16_t * pRes,int16_t iFF,int16_t iMF)459 int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
460 int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
461 return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, iThreshold);
462 }
463 #endif
/*
 * Fills the function-pointer table pFuncList with the copy, Hadamard, DCT,
 * scan, scoring and quantisation routines to use.
 *
 * The portable *_c implementations are installed first. Each section that
 * follows is compiled only when its build macro is defined and, at run time,
 * replaces a subset of the entries when the matching bit of uiCpuFlag is set.
 * Sections run in source order, so for any entry assigned by more than one
 * section the last matching section wins.
 *
 * pFuncList  table to populate
 * uiCpuFlag  bitmask tested against the WELS_CPU_* constants
 */
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
  /* portable defaults */
  pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
  pFuncList->pfCopy16x16Aligned =
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c;
  pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_c;
  pFuncList->pfCopy8x16Aligned = WelsCopy8x16_c;
  pFuncList->pfCopy4x4 = WelsCopy4x4_c;
  pFuncList->pfCopy8x4 = WelsCopy8x4_c;
  pFuncList->pfCopy4x8 = WelsCopy4x8_c;
  pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_c;
  pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c;
  pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_c;

  pFuncList->pfDctT4 = WelsDctT4_c;
  pFuncList->pfDctFourT4 = WelsDctFourT4_c;

  pFuncList->pfScan4x4 = WelsScan4x4DcAc_c;
  pFuncList->pfScan4x4Ac = WelsScan4x4Ac_c;
  pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_c;

  pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_c;

  pFuncList->pfQuantization4x4 = WelsQuant4x4_c;
  pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_c;
  pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_c;
  pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_c;

#if defined(X86_ASM)
  /* x86: the checks below are independent, so several may apply and each
   * later one overrides the entries it shares with an earlier one */
  if (uiCpuFlag & WELS_CPU_MMXEXT) {

    pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_mmx;
    pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_mmx;

    pFuncList->pfDctT4 = WelsDctT4_mmx;

    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmx;
    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
  }
  if (uiCpuFlag & WELS_CPU_SSE2) {
    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;

    pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2;
    pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_sse2;
    pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_sse2;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2;

    pFuncList->pfCopy16x16Aligned = WelsCopy16x16_sse2;
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_sse2;
    pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_sse2;

    pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2;
    pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
    pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;

    pFuncList->pfDctT4 = WelsDctT4_sse2;
    pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
  }
//#ifndef MACOS
  if (uiCpuFlag & WELS_CPU_SSSE3) {
    pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
  }
  if (uiCpuFlag & WELS_CPU_SSE42) {
    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42;
  }
#if defined(HAVE_AVX2)
  if (uiCpuFlag & WELS_CPU_AVX2) {
    pFuncList->pfDctT4 = WelsDctT4_avx2;
    pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;

    pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
    pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
    pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
  }
#endif
//#endif//MACOS

#endif//X86_ASM

#if defined(HAVE_NEON)
  /* NEON build: selected by the WELS_CPU_NEON bit */
  if (uiCpuFlag & WELS_CPU_NEON) {
    pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_neon;
    pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_neon;
    pFuncList->pfDctT4 = WelsDctT4_neon;
    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_neon;
    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_neon;

    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_neon;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;

    pFuncList->pfQuantization4x4 = WelsQuant4x4_neon;
    pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_neon;
    pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_neon;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;

    pFuncList->pfCopy16x16Aligned = WelsCopy16x16_neon;
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_neon;
    pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_neon;
    pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
  }
#endif

#if defined(HAVE_NEON_AARCH64)
  /* AArch64 NEON build: tested against the same WELS_CPU_NEON bit */
  if (uiCpuFlag & WELS_CPU_NEON) {
    pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_AArch64_neon;
    pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_AArch64_neon;
    pFuncList->pfDctT4 = WelsDctT4_AArch64_neon;
    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon;
    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon;

    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_AArch64_neon;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;

    pFuncList->pfQuantization4x4 = WelsQuant4x4_AArch64_neon;
    pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_AArch64_neon;
    pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_AArch64_neon;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;

    pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon;
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon;
    pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon;
    pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
  }
#endif

#if defined(HAVE_MMI)
  /* MMI build: the Hadamard 2x2 entries are not replaced here */
  if (uiCpuFlag & WELS_CPU_MMI) {
    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmi;
    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmi;

    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_mmi;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;

    pFuncList->pfQuantization4x4 = WelsQuant4x4_mmi;
    pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_mmi;
    pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_mmi;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;

    pFuncList->pfCopy16x16Aligned = WelsCopy16x16_mmi;
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_mmi;
    pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_mmi;

    pFuncList->pfScan4x4 = WelsScan4x4DcAc_mmi;
    pFuncList->pfScan4x4Ac = WelsScan4x4Ac_mmi;
    pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_mmi;

    pFuncList->pfDctT4 = WelsDctT4_mmi;
    pFuncList->pfDctFourT4 = WelsDctFourT4_mmi;
  }
#endif//HAVE_MMI

#if defined(HAVE_MSA)
  /* MSA build: only copy routines are replaced */
  if (uiCpuFlag & WELS_CPU_MSA) {
    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_msa;
    pFuncList->pfCopy8x16Aligned = WelsCopy8x16_msa;

    pFuncList->pfCopy16x16Aligned =
      pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_msa;
    pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_msa;
  }
#endif

#if defined(HAVE_LSX)
  /* LSX build: three copy routines and the four-block quantiser with max */
  if (uiCpuFlag & WELS_CPU_LSX) {
    pFuncList->pfCopy8x8Aligned = WelsCopy8x8_lsx;
    pFuncList->pfCopy16x16Aligned = WelsCopy16x16_lsx;
    pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_lsx;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_lsx;
  }
#endif
}
636 }
637