• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 
34 #include "ls_defines.h"
35 #include "encode_mb_aux.h"
36 #include "cpu_core.h"
37 namespace WelsEnc {
38 
/*
 * g_kiQuantInterFF: 58 rows of 8 int16_t values, declared through
 * ALIGNED_DECLARE with an argument of 16 (presumably a 16-byte alignment
 * request -- confirm against the macro definition).
 *
 * Every row has the layout { a, b, a, b, b, c, b, c }. The quantization
 * routines in this file index their pFF argument with (coefficient index & 7),
 * so one 8-entry row spans two rows of a 4x4 block.
 * NOTE(review): presumably a row of this table is what gets passed as pFF --
 * confirm against the callers.
 *
 * The leading comment on rows 0..51 is the row index. The six trailing rows
 * (array rows 52..57) are labelled 46..51 again; see the note above them.
 */
39 ALIGNED_DECLARE (const int16_t, g_kiQuantInterFF[58][8], 16) = {
40   /* 0*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
41   /* 1*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
42   /* 2*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
43   /* 3*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
44   /* 4*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
45   /* 5*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
46   /* 6*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
47   /* 7*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
48   /* 8*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
49   /* 9*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
50   /*10*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
51   /*11*/ {   1,   2,   1,   2,   2,   4,   2,   4 },
52   /*12*/ {   2,   3,   2,   3,   3,   4,   3,   4 },
53   /*13*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
54   /*14*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
55   /*15*/ {   2,   4,   2,   4,   4,   6,   4,   6 },
56   /*16*/ {   3,   4,   3,   4,   4,   7,   4,   7 },
57   /*17*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
58   /*18*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
59   /*19*/ {   4,   6,   4,   6,   6,   9,   6,   9 },
60   /*20*/ {   4,   7,   4,   7,   7,  10,   7,  10 },
61   /*21*/ {   5,   8,   5,   8,   8,  12,   8,  12 },
62   /*22*/ {   5,   8,   5,   8,   8,  13,   8,  13 },
63   /*23*/ {   6,  10,   6,  10,  10,  15,  10,  15 },
64   /*24*/ {   7,  11,   7,  11,  11,  17,  11,  17 },
65   /*25*/ {   7,  12,   7,  12,  12,  19,  12,  19 },
66   /*26*/ {   9,  13,   9,  13,  13,  21,  13,  21 },
67   /*27*/ {   9,  15,   9,  15,  15,  24,  15,  24 },
68   /*28*/ {  11,  17,  11,  17,  17,  26,  17,  26 },
69   /*29*/ {  12,  19,  12,  19,  19,  30,  19,  30 },
70   /*30*/ {  13,  22,  13,  22,  22,  33,  22,  33 },
71   /*31*/ {  15,  23,  15,  23,  23,  38,  23,  38 },
72   /*32*/ {  17,  27,  17,  27,  27,  42,  27,  42 },
73   /*33*/ {  19,  30,  19,  30,  30,  48,  30,  48 },
74   /*34*/ {  21,  33,  21,  33,  33,  52,  33,  52 },
75   /*35*/ {  24,  38,  24,  38,  38,  60,  38,  60 },
76   /*36*/ {  27,  43,  27,  43,  43,  67,  43,  67 },
77   /*37*/ {  29,  47,  29,  47,  47,  75,  47,  75 },
78   /*38*/ {  35,  53,  35,  53,  53,  83,  53,  83 },
79   /*39*/ {  37,  60,  37,  60,  60,  96,  60,  96 },
80   /*40*/ {  43,  67,  43,  67,  67, 104,  67, 104 },
81   /*41*/ {  48,  77,  48,  77,  77, 121,  77, 121 },
82   /*42*/ {  53,  87,  53,  87,  87, 133,  87, 133 },
83   /*43*/ {  59,  93,  59,  93,  93, 150,  93, 150 },
84   /*44*/ {  69, 107,  69, 107, 107, 167, 107, 167 },
85   /*45*/ {  75, 120,  75, 120, 120, 192, 120, 192 },
86   /*46*/ {  85, 133,  85, 133, 133, 208, 133, 208 },
87   /*47*/ {  96, 153,  96, 153, 153, 242, 153, 242 },
88   /*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
89   /*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
90   /*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
91   /*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
92   /* The remaining rows (array rows 52..57) are only for intra.
   * NOTE(review): their labels restart at 46; presumably they name the index
   * as seen through an intra view of this table that starts 6 rows in --
   * confirm where the intra table is defined. */
93   /*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
94   /*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
95   /*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
96   /*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
97   /*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
98   /*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },
99 };
100 
101 
102 
/*
 * g_kiQuantMF: 52 rows of 8 int16_t values, declared through ALIGNED_DECLARE
 * with an argument of 16 (presumably a 16-byte alignment request -- confirm
 * against the macro definition).
 *
 * Every row has the layout { a, b, a, b, b, c, b, c }, the same positional
 * pattern as g_kiQuantInterFF, and the leading comment is the row index.
 * The values roughly halve every six rows (26214 -> 13107 -> 6554 -> ...).
 * NOTE(review): presumably a row of this table is what gets passed as the pMF
 * multiplier argument of the quantization routines below, which shift the
 * product right by 16 -- confirm against the callers.
 */
103 ALIGNED_DECLARE (const int16_t, g_kiQuantMF[52][8], 16) = {
104   /* 0*/        {26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 },
105   /* 1*/        {23832, 14980, 23832, 14980, 14980,  9320, 14980,  9320 },
106   /* 2*/        {20164, 13108, 20164, 13108, 13108,  8388, 13108,  8388 },
107   /* 3*/        {18724, 11650, 18724, 11650, 11650,  7294, 11650,  7294 },
108   /* 4*/        {16384, 10486, 16384, 10486, 10486,  6710, 10486,  6710 },
109   /* 5*/        {14564,  9118, 14564,  9118,  9118,  5786,  9118,  5786 },
110   /* 6*/        {13107,  8066, 13107,  8066,  8066,  5243,  8066,  5243 },
111   /* 7*/        {11916,  7490, 11916,  7490,  7490,  4660,  7490,  4660 },
112   /* 8*/        {10082,  6554, 10082,  6554,  6554,  4194,  6554,  4194 },
113   /* 9*/        { 9362,  5825,  9362,  5825,  5825,  3647,  5825,  3647 },
114   /*10*/        { 8192,  5243,  8192,  5243,  5243,  3355,  5243,  3355 },
115   /*11*/        { 7282,  4559,  7282,  4559,  4559,  2893,  4559,  2893 },
116   /*12*/        { 6554,  4033,  6554,  4033,  4033,  2622,  4033,  2622 },
117   /*13*/        { 5958,  3745,  5958,  3745,  3745,  2330,  3745,  2330 },
118   /*14*/        { 5041,  3277,  5041,  3277,  3277,  2097,  3277,  2097 },
119   /*15*/        { 4681,  2913,  4681,  2913,  2913,  1824,  2913,  1824 },
120   /*16*/        { 4096,  2622,  4096,  2622,  2622,  1678,  2622,  1678 },
121   /*17*/        { 3641,  2280,  3641,  2280,  2280,  1447,  2280,  1447 },
122   /*18*/        { 3277,  2017,  3277,  2017,  2017,  1311,  2017,  1311 },
123   /*19*/        { 2979,  1873,  2979,  1873,  1873,  1165,  1873,  1165 },
124   /*20*/        { 2521,  1639,  2521,  1639,  1639,  1049,  1639,  1049 },
125   /*21*/        { 2341,  1456,  2341,  1456,  1456,   912,  1456,   912 },
126   /*22*/        { 2048,  1311,  2048,  1311,  1311,   839,  1311,   839 },
127   /*23*/        { 1821,  1140,  1821,  1140,  1140,   723,  1140,   723 },
128   /*24*/        { 1638,  1008,  1638,  1008,  1008,   655,  1008,   655 },
129   /*25*/        { 1490,   936,  1490,   936,   936,   583,   936,   583 },
130   /*26*/        { 1260,   819,  1260,   819,   819,   524,   819,   524 },
131   /*27*/        { 1170,   728,  1170,   728,   728,   456,   728,   456 },
132   /*28*/        { 1024,   655,  1024,   655,   655,   419,   655,   419 },
133   /*29*/        {  910,   570,   910,   570,   570,   362,   570,   362 },
134   /*30*/        {  819,   504,   819,   504,   504,   328,   504,   328 },
135   /*31*/        {  745,   468,   745,   468,   468,   291,   468,   291 },
136   /*32*/        {  630,   410,   630,   410,   410,   262,   410,   262 },
137   /*33*/        {  585,   364,   585,   364,   364,   228,   364,   228 },
138   /*34*/        {  512,   328,   512,   328,   328,   210,   328,   210 },
139   /*35*/        {  455,   285,   455,   285,   285,   181,   285,   181 },
140   /*36*/        {  410,   252,   410,   252,   252,   164,   252,   164 },
141   /*37*/        {  372,   234,   372,   234,   234,   146,   234,   146 },
142   /*38*/        {  315,   205,   315,   205,   205,   131,   205,   131 },
143   /*39*/        {  293,   182,   293,   182,   182,   114,   182,   114 },
144   /*40*/        {  256,   164,   256,   164,   164,   105,   164,   105 },
145   /*41*/        {  228,   142,   228,   142,   142,    90,   142,    90 },
146   /*42*/        {  205,   126,   205,   126,   126,    82,   126,    82 },
147   /*43*/        {  186,   117,   186,   117,   117,    73,   117,    73 },
148   /*44*/        {  158,   102,   158,   102,   102,    66,   102,    66 },
149   /*45*/        {  146,    91,   146,    91,    91,    57,    91,    57 },
150   /*46*/        {  128,    82,   128,    82,    82,    52,    82,    52 },
151   /*47*/        {  114,    71,   114,    71,    71,    45,    71,    45 },
152   /*48*/        {  102,    63,   102,    63,    63,    41,    63,    41 },
153   /*49*/        {   93,    59,    93,    59,    59,    36,    59,    36 },
154   /*50*/        {   79,    51,    79,    51,    51,    33,    51,    33 },
155   /*51*/        {   73,    46,    73,    46,    46,    28,    46,    28 }
156 };
157 
158 /****************************************************************************
159  * HDM and Quant functions
160  ****************************************************************************/
/*
 * Quantization helper macros.
 *
 * All three expand to expressions that read a variable named `iSign`, which
 * must be in scope (and already set) at the point of use; the routines in
 * this file assign it from WELS_SIGN() of the coefficient right before each
 * use.
 *
 * WELS_ABS_LC(a)            (iSign ^ a) - iSign, evaluated as int32_t.
 *                           With iSign == 0 this yields a; with iSign == -1
 *                           it yields -a.
 * NEW_QUANT(pDct, iFF, iMF) ((iFF + WELS_ABS_LC(pDct)) * iMF) >> 16.
 *                           The expansion is fully parenthesized so it can be
 *                           combined with other operators without the shift
 *                           swallowing them.
 * WELS_NEW_QUANT(...)       WELS_ABS_LC applied to the NEW_QUANT result.
 */
#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
#define NEW_QUANT(pDct, iFF, iMF) ((((iFF) + WELS_ABS_LC(pDct)) * (iMF)) >> 16)
#define WELS_NEW_QUANT(pDct, iFF, iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))
WelsQuant4x4_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF)164 void WelsQuant4x4_c (int16_t* pDct, const int16_t* pFF,  const int16_t* pMF) {
165   int32_t i, j, iSign;
166   for (i = 0; i < 16; i += 4) {
167     j = i & 0x07;
168     iSign = WELS_SIGN (pDct[i]);
169     pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
170     iSign = WELS_SIGN (pDct[i + 1]);
171     pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
172     iSign = WELS_SIGN (pDct[i + 2]);
173     pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
174     iSign = WELS_SIGN (pDct[i + 3]);
175     pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
176   }
177 }
178 
WelsQuant4x4Dc_c(int16_t * pDct,int16_t iFF,int16_t iMF)179 void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF,  int16_t iMF) {
180   int32_t i, iSign;
181   for (i = 0; i < 16; i += 4) {
182     iSign = WELS_SIGN (pDct[i]);
183     pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF);
184     iSign = WELS_SIGN (pDct[i + 1]);
185     pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], iFF, iMF);
186     iSign = WELS_SIGN (pDct[i + 2]);
187     pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], iFF, iMF);
188     iSign = WELS_SIGN (pDct[i + 3]);
189     pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], iFF, iMF);
190   }
191 }
192 
WelsQuantFour4x4_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF)193 void WelsQuantFour4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
194   int32_t i, j, iSign;
195 
196   for (i = 0; i < 64; i += 4) {
197     j = i & 0x07;
198     iSign = WELS_SIGN (pDct[i]);
199     pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
200     iSign = WELS_SIGN (pDct[i + 1]);
201     pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
202     iSign = WELS_SIGN (pDct[i + 2]);
203     pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
204     iSign = WELS_SIGN (pDct[i + 3]);
205     pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
206   }
207 }
208 
WelsQuantFour4x4Max_c(int16_t * pDct,const int16_t * pFF,const int16_t * pMF,int16_t * pMax)209 void WelsQuantFour4x4Max_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
210   int32_t i, j, k, iSign;
211   int16_t iMaxAbs;
212   for (k = 0; k < 4; k++) {
213     iMaxAbs = 0;
214     for (i = 0; i < 16; i++) {
215       j = i & 0x07;
216       iSign = WELS_SIGN (pDct[i]);
217       pDct[i] = NEW_QUANT (pDct[i], pFF[j], pMF[j]);
218       if (iMaxAbs < pDct[i]) iMaxAbs = pDct[i];
219       pDct[i] = WELS_ABS_LC (pDct[i]);
220     }
221     pDct += 16;
222     pMax[k] = iMaxAbs;
223   }
224 }
225 
WelsHadamardQuant2x2Skip_c(int16_t * pRs,int16_t iFF,int16_t iMF)226 int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRs, int16_t iFF,  int16_t iMF) {
227   int16_t pDct[4], s[4];
228   int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
229 
230   s[0] = pRs[0]  + pRs[32];
231   s[1] = pRs[0]  - pRs[32];
232   s[2] = pRs[16] + pRs[48];
233   s[3] = pRs[16] - pRs[48];
234 
235   pDct[0] = s[0] + s[2];
236   pDct[1] = s[0] - s[2];
237   pDct[2] = s[1] + s[3];
238   pDct[3] = s[1] - s[3];
239 
240   return ((WELS_ABS (pDct[0]) > iThreshold) || (WELS_ABS (pDct[1]) > iThreshold) || (WELS_ABS (pDct[2]) > iThreshold)
241           || (WELS_ABS (pDct[3]) > iThreshold));
242 }
243 
WelsHadamardQuant2x2_c(int16_t * pRs,const int16_t iFF,int16_t iMF,int16_t * pDct,int16_t * pBlock)244 int32_t WelsHadamardQuant2x2_c (int16_t* pRs, const int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock) {
245   int16_t s[4];
246   int32_t iSign, i, iDcNzc = 0;
247 
248   s[0] = pRs[0]  + pRs[32];
249   s[1] = pRs[0]  - pRs[32];
250   s[2] = pRs[16] + pRs[48];
251   s[3] = pRs[16] - pRs[48];
252 
253   pRs[0] = 0;
254   pRs[16] = 0;
255   pRs[32] = 0;
256   pRs[48] = 0;
257 
258   pDct[0] = s[0] + s[2];
259   pDct[1] = s[0] - s[2];
260   pDct[2] = s[1] + s[3];
261   pDct[3] = s[1] - s[3];
262 
263   iSign = WELS_SIGN (pDct[0]);
264   pDct[0] = WELS_NEW_QUANT (pDct[0], iFF, iMF);
265   iSign = WELS_SIGN (pDct[1]);
266   pDct[1] = WELS_NEW_QUANT (pDct[1], iFF, iMF);
267   iSign = WELS_SIGN (pDct[2]);
268   pDct[2] = WELS_NEW_QUANT (pDct[2], iFF, iMF);
269   iSign = WELS_SIGN (pDct[3]);
270   pDct[3] = WELS_NEW_QUANT (pDct[3], iFF, iMF);
271 
272   ST64 (pBlock, LD64 (pDct));
273 
274   for (i = 0; i < 4; i++)
275     iDcNzc += (pBlock[i] != 0);
276   return iDcNzc;
277 }
278 
279 /* dc value pick up and hdm_4x4 */
WelsHadamardT4Dc_c(int16_t * pLumaDc,int16_t * pDct)280 void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct) {
281   int32_t p[16], s[4];
282   int32_t i, iIdx;
283 
284   for (i = 0 ; i < 16 ; i += 4) {
285     iIdx = ((i & 0x08) << 4) + ((i & 0x04) << 3);
286     s[0] = pDct[iIdx ]     + pDct[iIdx + 80];
287     s[3] = pDct[iIdx ]     - pDct[iIdx + 80];
288     s[1] = pDct[iIdx + 16] + pDct[iIdx + 64];
289     s[2] = pDct[iIdx + 16] - pDct[iIdx + 64];
290 
291     p[i  ]   = s[0] + s[1];
292     p[i + 2] = s[0] - s[1];
293     p[i + 1] = s[3] + s[2];
294     p[i + 3] = s[3] - s[2];
295   }
296 
297   for (i = 0 ; i < 4 ; i ++) {
298     s[0] = p[i ]    + p[i + 12];
299     s[3] = p[i ]    - p[i + 12];
300     s[1] = p[i + 4] + p[i + 8];
301     s[2] = p[i + 4] - p[i + 8];
302 
303     pLumaDc[i  ]    = WELS_CLIP3 ((s[0] + s[1] + 1) >> 1, -32768, 32767);
304     pLumaDc[i + 8 ] = WELS_CLIP3 ((s[0] - s[1] + 1) >> 1, -32768, 32767);
305     pLumaDc[i + 4 ] = WELS_CLIP3 ((s[3] + s[2] + 1) >> 1, -32768, 32767);
306     pLumaDc[i + 12] = WELS_CLIP3 ((s[3] - s[2] + 1) >> 1, -32768, 32767);
307   }
308 }
309 
310 /****************************************************************************
311  * DCT functions
312  ****************************************************************************/
/*!
 * Forward 4x4 integer transform of the difference between two pixel blocks.
 *
 * For each of 4 rows the first 4 pixels of pPixel2 are subtracted from those
 * of pPixel1; the differences go through a horizontal butterfly into pDct,
 * and pDct is then transformed column by column in place.
 *
 * \param pDct      receives the 16 coefficients, stored row by row
 * \param pPixel1   first pixel block (minuend), 4 pixels read per row
 * \param iStride1  distance between consecutive rows of pPixel1
 * \param pPixel2   second pixel block (subtrahend), 4 pixels read per row
 * \param iStride2  distance between consecutive rows of pPixel2
 */
void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  /* Horizontal pass: one row of residuals per iteration. */
  for (int32_t iRow = 0; iRow < 4; ++iRow) {
    int16_t* pOut = pDct + (iRow << 2);

    const int16_t kiD0 = pPixel1[0] - pPixel2[0];
    const int16_t kiD1 = pPixel1[1] - pPixel2[1];
    const int16_t kiD2 = pPixel1[2] - pPixel2[2];
    const int16_t kiD3 = pPixel1[3] - pPixel2[3];

    const int16_t kiOuterSum  = kiD0 + kiD3;
    const int16_t kiOuterDiff = kiD0 - kiD3;
    const int16_t kiInnerSum  = kiD1 + kiD2;
    const int16_t kiInnerDiff = kiD1 - kiD2;

    pOut[0] = kiOuterSum + kiInnerSum;
    pOut[1] = 2 * kiOuterDiff + kiInnerDiff;
    pOut[2] = kiOuterSum - kiInnerSum;
    pOut[3] = kiOuterDiff - 2 * kiInnerDiff;

    pPixel1 += iStride1;
    pPixel2 += iStride2;
  }

  /* Vertical pass: same butterfly down each column, in place. */
  for (int32_t iCol = 0; iCol < 4; ++iCol) {
    int16_t* pCol = pDct + iCol;

    const int16_t kiOuterSum  = pCol[0] + pCol[12];
    const int16_t kiOuterDiff = pCol[0] - pCol[12];
    const int16_t kiInnerSum  = pCol[4] + pCol[8];
    const int16_t kiInnerDiff = pCol[4] - pCol[8];

    pCol[0]  = kiOuterSum + kiInnerSum;
    pCol[4]  = 2 * kiOuterDiff + kiInnerDiff;
    pCol[8]  = kiOuterSum - kiInnerSum;
    pCol[12] = kiOuterDiff - 2 * kiInnerDiff;
  }
}
357 
WelsDctFourT4_c(int16_t * pDct,uint8_t * pPixel1,int32_t iStride1,uint8_t * pPixel2,int32_t iStride2)358 void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
359   int32_t stride_1 = iStride1 << 2;
360   int32_t stride_2 = iStride2 << 2;
361 
362   WelsDctT4_c (pDct,      &pPixel1[0],            iStride1, &pPixel2[0],            iStride2);
363   WelsDctT4_c (pDct + 16, &pPixel1[4],            iStride1, &pPixel2[4],            iStride2);
364   WelsDctT4_c (pDct + 32, &pPixel1[stride_1    ], iStride1, &pPixel2[stride_2    ], iStride2);
365   WelsDctT4_c (pDct + 48, &pPixel1[stride_1 + 4], iStride1, &pPixel2[stride_2 + 4], iStride2);
366 }
367 
368 /****************************************************************************
369  * Scan and Score functions
370  ****************************************************************************/
/*!
 * Reorders the 16 coefficients of a 4x4 block into scan order.
 *
 * pLevel[k] is taken from pDct at positions
 * 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 for k = 0..15.
 *
 * \param pLevel  receives the 16 reordered coefficients
 * \param pDct    16 source coefficients (not modified)
 */
void WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct) {
  static const uint8_t kuiScanPos[16] = {
    0,  1,  4,  8,
    5,  2,  3,  6,
    9, 12, 13, 10,
    7, 11, 14, 15
  };

  for (int32_t k = 0; k < 16; ++k) {
    pLevel[k] = pDct[kuiScanPos[k]];
  }
}
385 
/*!
 * Reorders the 15 coefficients pDct[1..15] of a 4x4 block into scan order;
 * pDct[0] is skipped and the last output slot is zeroed.
 *
 * pLevel[k] is taken from pDct at positions
 * 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 for k = 0..14.
 *
 * \param pLevel  receives 15 reordered coefficients followed by a 0
 * \param pDct    16 source coefficients (not modified)
 */
void WelsScan4x4Ac_c (int16_t* pLevel, int16_t* pDct) {
  static const uint8_t kuiAcScanPos[15] = {
    1,  4,  8,  5,  2,
    3,  6,  9, 12, 13,
    10, 7, 11, 14, 15
  };

  for (int32_t k = 0; k < 15; ++k) {
    pLevel[k] = pDct[kuiAcScanPos[k]];
  }
  pLevel[15] = 0;
}
401 
/*!
 * Reorders the 16 coefficients of a 4x4 block into scan order. Produces the
 * same output as WelsScan4x4DcAc_c.
 *
 * \param pLevel  receives the 16 reordered coefficients
 * \param pDct    16 source coefficients (not modified)
 */
void WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct) {
  /* Outputs are written in ascending order; the right-hand index is the
   * position of that coefficient in the row-major source block. */
  pLevel[0]  = pDct[0];
  pLevel[1]  = pDct[1];
  pLevel[2]  = pDct[4];
  pLevel[3]  = pDct[8];

  pLevel[4]  = pDct[5];
  pLevel[5]  = pDct[2];
  pLevel[6]  = pDct[3];
  pLevel[7]  = pDct[6];

  pLevel[8]  = pDct[9];
  pLevel[9]  = pDct[12];
  pLevel[10] = pDct[13];
  pLevel[11] = pDct[10];

  pLevel[12] = pDct[7];
  pLevel[13] = pDct[11];
  pLevel[14] = pDct[14];
  pLevel[15] = pDct[15];
}
416 
/*!
 * Scores a 16-entry coefficient array by the zero runs in front of its
 * non-zero entries (refer to JVT-O079).
 *
 * Walking from index 15 down to 0, every non-zero entry adds a value looked
 * up by the number of zeros directly below it: 3 for none, 2 for one or two,
 * 1 for three to five, 0 for six or more.
 *
 * \param pDct  16 coefficients (not modified)
 * \return the accumulated score; 0 when every entry is zero
 */
int32_t WelsCalculateSingleCtr4x4_c (int16_t* pDct) {
  static const int32_t kiRunScore[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  int32_t iScore = 0;
  int32_t iPos   = 15;

  /* Drop the zeros above the highest non-zero entry. */
  for (; iPos >= 0; --iPos) {
    if (pDct[iPos] != 0) {
      break;
    }
  }

  /* iPos sits on a non-zero entry at the top of every iteration. */
  while (iPos >= 0) {
    const int32_t kiBelow = --iPos;  /* first index under that entry */

    while (iPos >= 0 && 0 == pDct[iPos]) {
      --iPos;
    }
    iScore += kiRunScore[kiBelow - iPos];
  }
  return iScore;
}
436 
/*!
 * Counts the non-zero entries among the first 16 values of pLevel.
 *
 * \param pLevel  16 coefficients (not modified)
 * \return number of entries that are not 0, in the range 0..16
 */
int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
  int32_t iNonZero = 0;

  for (int32_t k = 0; k < 16; ++k) {
    if (pLevel[k] != 0) {
      ++iNonZero;
    }
  }
  return iNonZero;
}
451 
#ifdef HAVE_NEON
/*!
 * NEON entry point for the 2x2 Hadamard skip test: forms the threshold from
 * iFF and iMF with the same expression as WelsHadamardQuant2x2Skip_c and
 * hands it to WelsHadamardQuant2x2SkipKernel_neon.
 *
 * \param pRes  coefficients forwarded unchanged to the kernel
 * \param iFF   subtracted from the quotient when forming the threshold
 * \param iMF   divisor of ((1 << 16) - 1) when forming the threshold
 * \return whatever the kernel returns
 */
int32_t WelsHadamardQuant2x2Skip_neon (int16_t* pRes, int16_t iFF,  int16_t iMF) {
  const int32_t kiQuotient  = ((1 << 16) - 1) / iMF;
  const int16_t kiThreshold = kiQuotient - iFF;

  return WelsHadamardQuant2x2SkipKernel_neon (pRes, kiThreshold);
}
#endif
#ifdef HAVE_NEON_AARCH64
/*!
 * AArch64 NEON entry point for the 2x2 Hadamard skip test: forms the
 * threshold from iFF and iMF with the same expression as
 * WelsHadamardQuant2x2Skip_c and hands it to
 * WelsHadamardQuant2x2SkipKernel_AArch64_neon.
 *
 * \param pRes  coefficients forwarded unchanged to the kernel
 * \param iFF   subtracted from the quotient when forming the threshold
 * \param iMF   divisor of ((1 << 16) - 1) when forming the threshold
 * \return whatever the kernel returns
 */
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF,  int16_t iMF) {
  const int32_t kiQuotient  = ((1 << 16) - 1) / iMF;
  const int16_t kiThreshold = kiQuotient - iFF;

  return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, kiThreshold);
}
#endif
WelsInitEncodingFuncs(SWelsFuncPtrList * pFuncList,uint32_t uiCpuFlag)464 void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) {
465   pFuncList->pfCopy8x8Aligned           = WelsCopy8x8_c;
466   pFuncList->pfCopy16x16Aligned         =
467   pFuncList->pfCopy16x16NotAligned      = WelsCopy16x16_c;
468   pFuncList->pfCopy16x8NotAligned       = WelsCopy16x8_c;
469   pFuncList->pfCopy8x16Aligned          = WelsCopy8x16_c;
470   pFuncList->pfCopy4x4           = WelsCopy4x4_c;
471   pFuncList->pfCopy8x4           = WelsCopy8x4_c;
472   pFuncList->pfCopy4x8           = WelsCopy4x8_c;
473   pFuncList->pfQuantizationHadamard2x2          = WelsHadamardQuant2x2_c;
474   pFuncList->pfQuantizationHadamard2x2Skip      = WelsHadamardQuant2x2Skip_c;
475   pFuncList->pfTransformHadamard4x4Dc           = WelsHadamardT4Dc_c;
476 
477   pFuncList->pfDctT4                    = WelsDctT4_c;
478   pFuncList->pfDctFourT4                = WelsDctFourT4_c;
479 
480   pFuncList->pfScan4x4                  = WelsScan4x4DcAc_c;
481   pFuncList->pfScan4x4Ac                = WelsScan4x4Ac_c;
482   pFuncList->pfCalculateSingleCtr4x4    = WelsCalculateSingleCtr4x4_c;
483 
484   pFuncList->pfGetNoneZeroCount         = WelsGetNoneZeroCount_c;
485 
486   pFuncList->pfQuantization4x4          = WelsQuant4x4_c;
487   pFuncList->pfQuantizationDc4x4        = WelsQuant4x4Dc_c;
488   pFuncList->pfQuantizationFour4x4      = WelsQuantFour4x4_c;
489   pFuncList->pfQuantizationFour4x4Max   = WelsQuantFour4x4Max_c;
490 
491 #if defined(X86_ASM)
492   if (uiCpuFlag & WELS_CPU_MMXEXT) {
493 
494     pFuncList->pfQuantizationHadamard2x2        = WelsHadamardQuant2x2_mmx;
495     pFuncList->pfQuantizationHadamard2x2Skip    = WelsHadamardQuant2x2Skip_mmx;
496 
497     pFuncList->pfDctT4                  = WelsDctT4_mmx;
498 
499     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmx;
500     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmx;
501   }
502   if (uiCpuFlag & WELS_CPU_SSE2) {
503     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
504     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
505 
506     pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
507     pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_sse2;
508     pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_sse2;
509     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2;
510 
511     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_sse2;
512     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_sse2;
513     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_sse2;
514 
515     pFuncList->pfScan4x4                = WelsScan4x4DcAc_sse2;
516     pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_sse2;
517     pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_sse2;
518 
519     pFuncList->pfDctT4                  = WelsDctT4_sse2;
520     pFuncList->pfDctFourT4              = WelsDctFourT4_sse2;
521   }
522 //#ifndef MACOS
523   if (uiCpuFlag & WELS_CPU_SSSE3) {
524     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
525   }
526   if (uiCpuFlag & WELS_CPU_SSE42) {
527     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse42;
528   }
529 #if defined(HAVE_AVX2)
530   if (uiCpuFlag & WELS_CPU_AVX2) {
531     pFuncList->pfDctT4                  = WelsDctT4_avx2;
532     pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;
533 
534     pFuncList->pfQuantization4x4        = WelsQuant4x4_avx2;
535     pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_avx2;
536     pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_avx2;
537     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
538   }
539 #endif
540 //#endif//MACOS
541 
542 #endif//X86_ASM
543 
544 #if defined(HAVE_NEON)
545   if (uiCpuFlag & WELS_CPU_NEON) {
546     pFuncList->pfQuantizationHadamard2x2        = WelsHadamardQuant2x2_neon;
547     pFuncList->pfQuantizationHadamard2x2Skip    = WelsHadamardQuant2x2Skip_neon;
548     pFuncList->pfDctT4                          = WelsDctT4_neon;
549     pFuncList->pfCopy8x8Aligned                 = WelsCopy8x8_neon;
550     pFuncList->pfCopy8x16Aligned                = WelsCopy8x16_neon;
551 
552     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_neon;
553     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;
554 
555     pFuncList->pfQuantization4x4        = WelsQuant4x4_neon;
556     pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_neon;
557     pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_neon;
558     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;
559 
560     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_neon;
561     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_neon;
562     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_neon;
563     pFuncList->pfDctFourT4              = WelsDctFourT4_neon;
564   }
565 #endif
566 
567 #if defined(HAVE_NEON_AARCH64)
568   if (uiCpuFlag & WELS_CPU_NEON) {
569     pFuncList->pfQuantizationHadamard2x2        = WelsHadamardQuant2x2_AArch64_neon;
570     pFuncList->pfQuantizationHadamard2x2Skip    = WelsHadamardQuant2x2Skip_AArch64_neon;
571     pFuncList->pfDctT4                          = WelsDctT4_AArch64_neon;
572     pFuncList->pfCopy8x8Aligned                 = WelsCopy8x8_AArch64_neon;
573     pFuncList->pfCopy8x16Aligned                = WelsCopy8x16_AArch64_neon;
574 
575     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_AArch64_neon;
576     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;
577 
578     pFuncList->pfQuantization4x4        = WelsQuant4x4_AArch64_neon;
579     pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_AArch64_neon;
580     pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_AArch64_neon;
581     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;
582 
583     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_AArch64_neon;
584     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_AArch64_neon;
585     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_AArch64_neon;
586     pFuncList->pfDctFourT4              = WelsDctFourT4_AArch64_neon;
587   }
588 #endif
589 
590 #if defined(HAVE_MMI)
591   if (uiCpuFlag & WELS_CPU_MMI) {
592     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmi;
593     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmi;
594 
595     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_mmi;
596     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;
597 
598     pFuncList->pfQuantization4x4        = WelsQuant4x4_mmi;
599     pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_mmi;
600     pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_mmi;
601     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;
602 
603     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_mmi;
604     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_mmi;
605     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_mmi;
606 
607     pFuncList->pfScan4x4                = WelsScan4x4DcAc_mmi;
608     pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_mmi;
609     pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_mmi;
610 
611     pFuncList->pfDctT4                  = WelsDctT4_mmi;
612     pFuncList->pfDctFourT4              = WelsDctFourT4_mmi;
613   }
614 #endif//HAVE_MMI
615 
616 #if defined(HAVE_MSA)
617   if (uiCpuFlag & WELS_CPU_MSA) {
618     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_msa;
619     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_msa;
620 
621     pFuncList->pfCopy16x16Aligned       =
622     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16_msa;
623     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8_msa;
624   }
625 #endif
626 
627 #if defined(HAVE_LSX)
628   if (uiCpuFlag & WELS_CPU_LSX) {
629     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_lsx;
630     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_lsx;
631     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_lsx;
632     pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_lsx;
633   }
634 #endif
635 }
636 }
637