• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    get_intra_predictor.c
33  *
34  * \brief   Implementation of the intra predictors for 16x16 luma, 4x4 luma and chroma blocks.
35  *
36  * \date    4/2/2009 Created
37  *          9/14/2009 C level based optimization with high performance gained.
38  *              [const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
39  *
40  *************************************************************************************
41  */
42 #include "ls_defines.h"
43 #include "cpu_core.h"
44 #include "intra_pred_common.h"
45 #include "get_intra_predictor.h"
46 
47 namespace WelsEnc {
/* Size constants for the 4x4, 8x8 and 16x16 predictors (presumably the block
 * edge length in pixels). None of them is referenced in the visible part of
 * this file. */
#define I4x4_COUNT 4
#define I8x8_COUNT 8
#define I16x16_COUNT 16
51 
/* Function-pointer types matching the two shapes of prediction-buffer fillers
 * defined below: one that copies from a source buffer and one that replicates
 * a single byte value. */
typedef void (*PFillingPred) (uint8_t* pPred, uint8_t* pSrc);
typedef void (*PFillingPred1to16) (uint8_t* pPred, const uint8_t kuiSrc);
54 
WelsFillingPred8to16_c(uint8_t * pPred,uint8_t * pSrc)55 static inline void WelsFillingPred8to16_c (uint8_t* pPred, uint8_t* pSrc) {
56   ST64 (pPred  , LD64 (pSrc));
57   ST64 (pPred + 8, LD64 (pSrc));
58 }
WelsFillingPred8x2to16_c(uint8_t * pPred,uint8_t * pSrc)59 static inline void WelsFillingPred8x2to16_c (uint8_t* pPred, uint8_t* pSrc) {
60   ST64 (pPred  , LD64 (pSrc));
61   ST64 (pPred + 8, LD64 (pSrc + 8));
62 }
WelsFillingPred1to16_c(uint8_t * pPred,const uint8_t kuiSrc)63 static inline void WelsFillingPred1to16_c (uint8_t* pPred, const uint8_t kuiSrc) {
64   const uint8_t kuiSrc8[8] = { kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc };
65   ST64 (pPred  , LD64 (kuiSrc8));
66   ST64 (pPred + 8, LD64 (kuiSrc8));
67 }
68 
/* Names used by the predictors below to reach the fillers; here they are bound
 * to the plain C implementations. */
#define WelsFillingPred8to16 WelsFillingPred8to16_c
#define WelsFillingPred8x2to16 WelsFillingPred8x2to16_c
#define WelsFillingPred1to16 WelsFillingPred1to16_c
72 
73 
74 
/* Row pitch of the 4x4 prediction buffer and its x2 / x3 multiples
 * (presumably the byte offsets of rows 1..3). Not referenced in the visible
 * part of this file. */
#define I4x4_PRED_STRIDE 4
#define I4x4_PRED_STRIDE2 8
#define I4x4_PRED_STRIDE3 12
78 
WelsI4x4LumaPredV_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)79 void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
80   const uint32_t kuiSrc = LD32 (&pRef[-kiStride]);
81   ENFORCE_STACK_ALIGN_1D (uint32_t, uiSrcx2, 2, 16)
82   uiSrcx2[0] = uiSrcx2[1] = kuiSrc;
83 
84   WelsFillingPred8to16 (pPred, (uint8_t*)&uiSrcx2[0]);
85 }
86 
WelsI4x4LumaPredH_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)87 void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
88   const uint32_t kiStridex2Left = (kiStride << 1) - 1;
89   const uint32_t kiStridex3Left = kiStride + kiStridex2Left;
90   const uint8_t kuiHor1 = pRef[-1];
91   const uint8_t kuiHor2 = pRef[kiStride - 1];
92   const uint8_t kuiHor3 = pRef[kiStridex2Left];
93   const uint8_t kuiHor4 = pRef[kiStridex3Left];
94   const uint8_t kuiVec1[4] = {kuiHor1, kuiHor1, kuiHor1, kuiHor1};
95   const uint8_t kuiVec2[4] = {kuiHor2, kuiHor2, kuiHor2, kuiHor2};
96   const uint8_t kuiVec3[4] = {kuiHor3, kuiHor3, kuiHor3, kuiHor3};
97   const uint8_t kuiVec4[4] = {kuiHor4, kuiHor4, kuiHor4, kuiHor4};
98   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
99   ST32 (&uiSrc[0], LD32 (kuiVec1));
100   ST32 (&uiSrc[4], LD32 (kuiVec2));
101   ST32 (&uiSrc[8], LD32 (kuiVec3));
102   ST32 (&uiSrc[12], LD32 (kuiVec4));
103 
104   WelsFillingPred8x2to16 (pPred, uiSrc);
105 }
WelsI4x4LumaPredDc_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)106 void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
107   const uint8_t kuiDcValue = (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +
108                               kiStride - 1] +
109                               pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 4) >> 3;
110 
111   WelsFillingPred1to16 (pPred, kuiDcValue);
112 }
113 
WelsI4x4LumaPredDcLeft_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)114 void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
115   const uint8_t kuiDcValue = (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +
116                               kiStride - 1] + 2) >> 2;
117 
118   WelsFillingPred1to16 (pPred, kuiDcValue);
119 }
120 
WelsI4x4LumaPredDcTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)121 void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
122   const uint8_t kuiDcValue = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;
123 
124   WelsFillingPred1to16 (pPred, kuiDcValue);
125 }
126 
WelsI4x4LumaPredDcNA_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)127 void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
128   const uint8_t kuiDcValue = 0x80;
129 
130   WelsFillingPred1to16 (pPred, kuiDcValue);
131 }
132 
133 /*down pLeft*/
WelsI4x4LumaPredDDL_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)134 void WelsI4x4LumaPredDDL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
135   /*get pTop*/
136   const uint8_t kuiT0   = pRef[-kiStride];
137   const uint8_t kuiT1   = pRef[1 - kiStride];
138   const uint8_t kuiT2   = pRef[2 - kiStride];
139   const uint8_t kuiT3   = pRef[3 - kiStride];
140   const uint8_t kuiT4   = pRef[4 - kiStride];
141   const uint8_t kuiT5   = pRef[5 - kiStride];
142   const uint8_t kuiT6   = pRef[6 - kiStride];
143   const uint8_t kuiT7   = pRef[7 - kiStride];
144   const uint8_t kuiDDL0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;      // uiDDL0
145   const uint8_t kuiDDL1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;      // uiDDL1
146   const uint8_t kuiDDL2 = (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2;      // uiDDL2
147   const uint8_t kuiDDL3 = (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2;      // uiDDL3
148   const uint8_t kuiDDL4 = (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2;      // uiDDL4
149   const uint8_t kuiDDL5 = (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2;      // uiDDL5
150   const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2;      // uiDDL6
151   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
152   uiSrc[0] = kuiDDL0;
153   uiSrc[1] = uiSrc[4] = kuiDDL1;
154   uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDDL2;
155   uiSrc[3] = uiSrc[6] = uiSrc[9] = uiSrc[12] = kuiDDL3;
156   uiSrc[7] = uiSrc[10] = uiSrc[13] = kuiDDL4;
157   uiSrc[11] = uiSrc[14] = kuiDDL5;
158   uiSrc[15] = kuiDDL6;
159 
160   WelsFillingPred8x2to16 (pPred, uiSrc);
161 }
162 
163 /*down pLeft*/
WelsI4x4LumaPredDDLTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)164 void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
165   /*get pTop*/
166   const uint8_t kuiT0   = pRef[-kiStride];
167   const uint8_t kuiT1   = pRef[1 - kiStride];
168   const uint8_t kuiT2   = pRef[2 - kiStride];
169   const uint8_t kuiT3   = pRef[3 - kiStride];
170   const uint8_t kuiDLT0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;      // uiDLT0
171   const uint8_t kuiDLT1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;      // uiDLT1
172   const uint8_t kuiDLT2 = (2 + kuiT2 + kuiT3 + (kuiT3 << 1)) >> 2;      // uiDLT2
173   const uint8_t kuiDLT3 = (2 + (kuiT3 << 2)) >> 2;                      // uiDLT3
174   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
175   memset (&uiSrc[6], kuiDLT3, 10 * sizeof (uint8_t));
176   uiSrc[0] = kuiDLT0;
177   uiSrc[1] = uiSrc[4] = kuiDLT1;
178   uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDLT2;
179   uiSrc[3] = kuiDLT3;
180 
181   WelsFillingPred8x2to16 (pPred, uiSrc);
182 }
183 
184 
185 /*down right*/
WelsI4x4LumaPredDDR_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)186 void WelsI4x4LumaPredDDR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
187   const int32_t kiStridex2  = kiStride << 1;
188   const int32_t kiStridex3  = kiStride + kiStridex2;
189   const uint8_t kuiLT       = pRef[-kiStride - 1];  // pTop-pLeft
190   /*get pLeft and pTop*/
191   const uint8_t kuiL0       = pRef[-1];
192   const uint8_t kuiL1       = pRef[kiStride - 1];
193   const uint8_t kuiL2       = pRef[kiStridex2 - 1];
194   const uint8_t kuiL3       = pRef[kiStridex3 - 1];
195   const uint8_t kuiT0       = pRef[-kiStride];
196   const uint8_t kuiT1       = pRef[1 - kiStride];
197   const uint8_t kuiT2       = pRef[2 - kiStride];
198   const uint8_t kuiT3       = pRef[3 - kiStride];
199   const uint16_t kuiTL0     = 1 + kuiLT + kuiL0;
200   const uint16_t kuiLT0     = 1 + kuiLT + kuiT0;
201   const uint16_t kuiT01     = 1 + kuiT0 + kuiT1;
202   const uint16_t kuiT12     = 1 + kuiT1 + kuiT2;
203   const uint16_t kuiT23     = 1 + kuiT2 + kuiT3;
204   const uint16_t kuiL01     = 1 + kuiL0 + kuiL1;
205   const uint16_t kuiL12     = 1 + kuiL1 + kuiL2;
206   const uint16_t kuiL23     = 1 + kuiL2 + kuiL3;
207   const uint8_t kuiDDR0     = (kuiTL0 + kuiLT0) >> 2;
208   const uint8_t kuiDDR1     = (kuiLT0 + kuiT01) >> 2;
209   const uint8_t kuiDDR2     = (kuiT01 + kuiT12) >> 2;
210   const uint8_t kuiDDR3     = (kuiT12 + kuiT23) >> 2;
211   const uint8_t kuiDDR4     = (kuiTL0 + kuiL01) >> 2;
212   const uint8_t kuiDDR5     = (kuiL01 + kuiL12) >> 2;
213   const uint8_t kuiDDR6     = (kuiL12 + kuiL23) >> 2;
214   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
215   uiSrc[0] = uiSrc[5] = uiSrc[10] = uiSrc[15] = kuiDDR0;
216   uiSrc[1] = uiSrc[6] = uiSrc[11] = kuiDDR1;
217   uiSrc[2] = uiSrc[7] = kuiDDR2;
218   uiSrc[3] = kuiDDR3;
219   uiSrc[4] = uiSrc[9] = uiSrc[14] = kuiDDR4;
220   uiSrc[8] = uiSrc[13] = kuiDDR5;
221   uiSrc[12] = kuiDDR6;
222 
223   WelsFillingPred8x2to16 (pPred, uiSrc);
224 }
225 
226 
227 /*vertical pLeft*/
WelsI4x4LumaPredVL_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)228 void WelsI4x4LumaPredVL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
229   /*get pTop*/
230   const uint8_t kuiT0   = pRef[-kiStride];
231   const uint8_t kuiT1   = pRef[1 - kiStride];
232   const uint8_t kuiT2   = pRef[2 - kiStride];
233   const uint8_t kuiT3   = pRef[3 - kiStride];
234   const uint8_t kuiT4   = pRef[4 - kiStride];
235   const uint8_t kuiT5   = pRef[5 - kiStride];
236   const uint8_t kuiT6   = pRef[6 - kiStride];
237   const uint8_t kuiVL0  = (1 + kuiT0 + kuiT1) >> 1;                     // uiVL0
238   const uint8_t kuiVL1  = (1 + kuiT1 + kuiT2) >> 1;                     // uiVL1
239   const uint8_t kuiVL2  = (1 + kuiT2 + kuiT3) >> 1;                     // uiVL2
240   const uint8_t kuiVL3  = (1 + kuiT3 + kuiT4) >> 1;                     // uiVL3
241   const uint8_t kuiVL4  = (1 + kuiT4 + kuiT5) >> 1;                     // uiVL4
242   const uint8_t kuiVL5  = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;      // uiVL5
243   const uint8_t kuiVL6  = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;      // uiVL6
244   const uint8_t kuiVL7  = (2 + kuiT2 + (kuiT3 << 1) + kuiT4) >> 2;      // uiVL7
245   const uint8_t kuiVL8  = (2 + kuiT3 + (kuiT4 << 1) + kuiT5) >> 2;      // uiVL8
246   const uint8_t kuiVL9  = (2 + kuiT4 + (kuiT5 << 1) + kuiT6) >> 2;      // uiVL9
247   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
248   uiSrc[0] = kuiVL0;
249   uiSrc[1] = uiSrc[8] = kuiVL1;
250   uiSrc[2] = uiSrc[9] = kuiVL2;
251   uiSrc[3] = uiSrc[10] = kuiVL3;
252   uiSrc[4] = kuiVL5;
253   uiSrc[5] = uiSrc[12] = kuiVL6;
254   uiSrc[6] = uiSrc[13] = kuiVL7;
255   uiSrc[7] = uiSrc[14] = kuiVL8;
256   uiSrc[11] = kuiVL4;
257   uiSrc[15] = kuiVL9;
258 
259   WelsFillingPred8x2to16 (pPred, uiSrc);
260 }
261 
262 
263 
264 /*vertical pLeft*/
WelsI4x4LumaPredVLTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)265 void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
266   uint8_t* pTopLeft     = &pRef[-kiStride - 1]; // pTop-pLeft
267   /*get pTop*/
268   const uint8_t kuiT0   = * (pTopLeft + 1);
269   const uint8_t kuiT1   = * (pTopLeft + 2);
270   const uint8_t kuiT2   = * (pTopLeft + 3);
271   const uint8_t kuiT3   = * (pTopLeft + 4);
272   const uint8_t kuiVLT0 = (1 + kuiT0 + kuiT1) >> 1;                     // uiVLT0
273   const uint8_t kuiVLT1 = (1 + kuiT1 + kuiT2) >> 1;                     // uiVLT1
274   const uint8_t kuiVLT2 = (1 + kuiT2 + kuiT3) >> 1;                     // uiVLT2
275   const uint8_t kuiVLT3 = (1 + (kuiT3 << 1)) >> 1;                      // uiVLT3
276   const uint8_t kuiVLT4 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;      // uiVLT4
277   const uint8_t kuiVLT5 = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;      // uiVLT5
278   const uint8_t kuiVLT6 = (2 + kuiT2 + (kuiT3 << 1) + kuiT3) >> 2;      // uiVLT6
279   const uint8_t kuiVLT7 = (2 + (kuiT3 << 2)) >> 2;                      // uiVLT7
280   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
281   uiSrc[0] = kuiVLT0;
282   uiSrc[1] = uiSrc[8] = kuiVLT1;
283   uiSrc[2] = uiSrc[9] = kuiVLT2;
284   uiSrc[3] = uiSrc[10] = uiSrc[11] = kuiVLT3;
285   uiSrc[4] = kuiVLT4;
286   uiSrc[5] = uiSrc[12] = kuiVLT5;
287   uiSrc[6] = uiSrc[13] = kuiVLT6;
288   uiSrc[7] = uiSrc[14] = uiSrc[15] = kuiVLT7;
289 
290   WelsFillingPred8x2to16 (pPred, uiSrc);
291 }
292 
293 /*vertical right*/
WelsI4x4LumaPredVR_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)294 void WelsI4x4LumaPredVR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
295   const int32_t kiStridex2  = kiStride << 1;
296   const uint8_t kuiLT       = pRef[-kiStride - 1];  // pTop-pLeft
297   /*get pLeft and pTop*/
298   const uint8_t kuiL0       = pRef[-1];
299   const uint8_t kuiL1       = pRef[kiStride - 1];
300   const uint8_t kuiL2       = pRef[kiStridex2 - 1];
301   const uint8_t kuiT0       = pRef[-kiStride];
302   const uint8_t kuiT1       = pRef[1 - kiStride];
303   const uint8_t kuiT2       = pRef[2 - kiStride];
304   const uint8_t kuiT3       = pRef[3 - kiStride];
305   const uint8_t kuiVR0      = (1 + kuiLT + kuiT0) >> 1;
306   const uint8_t kuiVR1      = (1 + kuiT0 + kuiT1) >> 1;
307   const uint8_t kuiVR2      = (1 + kuiT1 + kuiT2) >> 1;
308   const uint8_t kuiVR3      = (1 + kuiT2 + kuiT3) >> 1;
309   const uint8_t kuiVR4      = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;
310   const uint8_t kuiVR5      = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;
311   const uint8_t kuiVR6      = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;
312   const uint8_t kuiVR7      = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;
313   const uint8_t kuiVR8      = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;
314   const uint8_t kuiVR9      = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;
315   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
316   uiSrc[0] = uiSrc[9] = kuiVR0;
317   uiSrc[1] = uiSrc[10] = kuiVR1;
318   uiSrc[2] = uiSrc[11] = kuiVR2;
319   uiSrc[3] = kuiVR3;
320   uiSrc[4] = uiSrc[13] = kuiVR4;
321   uiSrc[5] = uiSrc[14] = kuiVR5;
322   uiSrc[6] = uiSrc[15] = kuiVR6;
323   uiSrc[7] = kuiVR7;
324   uiSrc[8] = kuiVR8;
325   uiSrc[12] = kuiVR9;
326 
327   WelsFillingPred8x2to16 (pPred, uiSrc);
328 }
329 
330 
331 /*horizontal up*/
WelsI4x4LumaPredHU_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)332 void WelsI4x4LumaPredHU_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
333   const int32_t kiStridex2  = kiStride << 1;
334   const int32_t kiStridex3  = kiStride + kiStridex2;
335   /*get pLeft*/
336   const uint8_t kuiL0       = pRef[-1];
337   const uint8_t kuiL1       = pRef[kiStride - 1];
338   const uint8_t kuiL2       = pRef[kiStridex2 - 1];
339   const uint8_t kuiL3       = pRef[kiStridex3 - 1];
340   const uint16_t kuiL01     = (1 + kuiL0 + kuiL1);
341   const uint16_t kuiL12     = (1 + kuiL1 + kuiL2);
342   const uint16_t kuiL23     = (1 + kuiL2 + kuiL3);
343   const uint8_t kuiHU0      = kuiL01 >> 1;
344   const uint8_t kuiHU1      = (kuiL01 + kuiL12) >> 2;
345   const uint8_t kuiHU2      = kuiL12 >> 1;
346   const uint8_t kuiHU3      = (kuiL12 + kuiL23) >> 2;
347   const uint8_t kuiHU4      = kuiL23 >> 1;
348   const uint8_t kuiHU5      = (1 + kuiL23 + (kuiL3 << 1)) >> 2;
349   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
350   uiSrc[0] = kuiHU0;
351   uiSrc[1] = kuiHU1;
352   uiSrc[2] = uiSrc[4] = kuiHU2;
353   uiSrc[3] = uiSrc[5] = kuiHU3;
354   uiSrc[6] = uiSrc[8] = kuiHU4;
355   uiSrc[7] = uiSrc[9] = kuiHU5;
356   memset (&uiSrc[10], kuiL3, 6 * sizeof (uint8_t));
357 
358   WelsFillingPred8x2to16 (pPred, uiSrc);
359 }
360 
361 
362 /*horizontal down*/
WelsI4x4LumaPredHD_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)363 void WelsI4x4LumaPredHD_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
364   const int32_t kiStridex2  = kiStride << 1;
365   const int32_t kiStridex3  = kiStride + kiStridex2;
366   const uint8_t kuiLT       = pRef[-kiStride - 1];  // pTop-pLeft
367   /*get pLeft and pTop*/
368   const uint8_t kuiL0       = pRef[-1];
369   const uint8_t kuiL1       = pRef[kiStride - 1];
370   const uint8_t kuiL2       = pRef[kiStridex2 - 1];
371   const uint8_t kuiL3       = pRef[kiStridex3 - 1];
372   const uint8_t kuiT0       = pRef[-kiStride];
373   const uint8_t kuiT1       = pRef[1 - kiStride];
374   const uint8_t kuiT2       = pRef[2 - kiStride];
375   const uint8_t kuiHD0      = (1 + kuiLT + kuiL0) >> 1;                     // uiHD0
376   const uint8_t kuiHD1      = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;      // uiHD1
377   const uint8_t kuiHD2      = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;      // uiHD2
378   const uint8_t kuiHD3      = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;      // uiHD3
379   const uint8_t kuiHD4      = (1 + kuiL0 + kuiL1) >> 1;                     // uiHD4
380   const uint8_t kuiHD5      = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;      // uiHD5
381   const uint8_t kuiHD6      = (1 + kuiL1 + kuiL2) >> 1;                     // uiHD6
382   const uint8_t kuiHD7      = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;      // uiHD7
383   const uint8_t kuiHD8      = (1 + kuiL2 + kuiL3) >> 1;                     // uiHD8
384   const uint8_t kuiHD9      = (2 + kuiL1 + (kuiL2 << 1) + kuiL3) >> 2;      // uiHD9
385   ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
386   uiSrc[0] = uiSrc[6] = kuiHD0;
387   uiSrc[1] = uiSrc[7] = kuiHD1;
388   uiSrc[2] = kuiHD2;
389   uiSrc[3] = kuiHD3;
390   uiSrc[4] = uiSrc[10] = kuiHD4;
391   uiSrc[5] = uiSrc[11] = kuiHD5;
392   uiSrc[8] = uiSrc[14] = kuiHD6;
393   uiSrc[9] = uiSrc[15] = kuiHD7;
394   uiSrc[12] = kuiHD8;
395   uiSrc[13] = kuiHD9;
396 
397   WelsFillingPred8x2to16 (pPred, uiSrc);
398 }
399 
400 
401 
/* Row pitch in bytes of the 8x8 chroma prediction buffer. */
#define I8x8_PRED_STRIDE 8
403 
WelsIChromaPredV_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)404 void WelsIChromaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
405   const uint64_t kuiSrc64 = LD64 (&pRef[-kiStride]);
406 
407   ST64 (pPred     , kuiSrc64);
408   ST64 (pPred + 8 , kuiSrc64);
409   ST64 (pPred + 16, kuiSrc64);
410   ST64 (pPred + 24, kuiSrc64);
411   ST64 (pPred + 32, kuiSrc64);
412   ST64 (pPred + 40, kuiSrc64);
413   ST64 (pPred + 48, kuiSrc64);
414   ST64 (pPred + 56, kuiSrc64);
415 }
416 
WelsIChromaPredH_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)417 void WelsIChromaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
418   int32_t iStridex7 = (kiStride << 3) - kiStride;
419   int32_t iI8x8Stridex7 = (I8x8_PRED_STRIDE << 3) - I8x8_PRED_STRIDE;
420   uint8_t i = 7;
421 
422   do {
423     const uint8_t kuiLeft = pRef[iStridex7 - 1]; // pLeft value
424     uint64_t kuiSrc64 = (uint64_t) (0x0101010101010101ULL * kuiLeft);
425     ST64 (pPred + iI8x8Stridex7, kuiSrc64);
426 
427     iStridex7 -= kiStride;
428     iI8x8Stridex7 -= I8x8_PRED_STRIDE;
429   } while (i-- > 0);
430 }
431 
432 
WelsIChromaPredPlane_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)433 void WelsIChromaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
434   int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
435   int32_t i, j;
436   uint8_t* pTop = &pRef[-kiStride];
437   uint8_t* pLeft = &pRef[-1];
438 
439   for (i = 0 ; i < 4 ; i ++) {
440     iTopSum += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
441     iLeftSum += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);
442   }
443 
444   iLTshift = (pLeft[7 * kiStride] + pTop[7]) << 4;
445   iTopshift = (17 * iTopSum + 16) >> 5;
446   iLeftshift = (17 * iLeftSum + 16) >> 5;
447 
448   for (i = 0 ; i < 8 ; i ++) {
449     for (j = 0 ; j < 8 ; j ++) {
450       pPred[j] = WelsClip1 ((iLTshift + iTopshift * (j - 3) + iLeftshift * (i - 3) + 16) >> 5);
451     }
452     pPred += I8x8_PRED_STRIDE;
453   }
454 }
455 
456 
WelsIChromaPredDc_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)457 void WelsIChromaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
458   const int32_t kuiL1 = kiStride - 1;
459   const int32_t kuiL2 = kuiL1 + kiStride;
460   const int32_t kuiL3 = kuiL2 + kiStride;
461   const int32_t kuiL4 = kuiL3 + kiStride;
462   const int32_t kuiL5 = kuiL4 + kiStride;
463   const int32_t kuiL6 = kuiL5 + kiStride;
464   const int32_t kuiL7 = kuiL6 + kiStride;
465   /*caculate the iMean value*/
466   const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] +
467                             pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 4) >> 3;
468   const uint32_t kuiSum2 = pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride];
469   const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];
470   const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;
471   const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;
472   const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;
473 
474   const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
475   const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4, kuiMean4, kuiMean4, kuiMean4};
476   const uint64_t kuiTopMean64 = LD64 (kuiTopMean);
477   const uint64_t kuiBottomMean64 = LD64 (kuiBottomMean);
478 
479   ST64 (pPred     , kuiTopMean64);
480   ST64 (pPred + 8 , kuiTopMean64);
481   ST64 (pPred + 16, kuiTopMean64);
482   ST64 (pPred + 24, kuiTopMean64);
483   ST64 (pPred + 32, kuiBottomMean64);
484   ST64 (pPred + 40, kuiBottomMean64);
485   ST64 (pPred + 48, kuiBottomMean64);
486   ST64 (pPred + 56, kuiBottomMean64);
487 }
488 
WelsIChromaPredDcLeft_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)489 void WelsIChromaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
490   const int32_t kuiL1   = kiStride - 1;
491   const int32_t kuiL2   = kuiL1 + kiStride;
492   const int32_t kuiL3   = kuiL2 + kiStride;
493   const int32_t kuiL4   = kuiL3 + kiStride;
494   const int32_t kuiL5   = kuiL4 + kiStride;
495   const int32_t kuiL6   = kuiL5 + kiStride;
496   const int32_t kuiL7   = kuiL6 + kiStride;
497   /*caculate the iMean value*/
498   const uint8_t kuiTopMean          = (pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 2) >> 2 ;
499   const uint8_t kuiBottomMean       = (pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7] + 2) >> 2;
500   const uint64_t kuiTopMean64       = (uint64_t) (0x0101010101010101ULL * kuiTopMean);
501   const uint64_t kuiBottomMean64    = (uint64_t) (0x0101010101010101ULL * kuiBottomMean);
502   ST64 (pPred     , kuiTopMean64);
503   ST64 (pPred + 8 , kuiTopMean64);
504   ST64 (pPred + 16, kuiTopMean64);
505   ST64 (pPred + 24, kuiTopMean64);
506   ST64 (pPred + 32, kuiBottomMean64);
507   ST64 (pPred + 40, kuiBottomMean64);
508   ST64 (pPred + 48, kuiBottomMean64);
509   ST64 (pPred + 56, kuiBottomMean64);
510 }
511 
WelsIChromaPredDcTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)512 void WelsIChromaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
513   /*caculate the iMean value*/
514   const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;
515   const uint8_t kuiMean2 = (pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride] + 2) >> 2;
516   const uint8_t kuiMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
517   const uint64_t kuiMean64 = LD64 (kuiMean);
518 
519   ST64 (pPred     , kuiMean64);
520   ST64 (pPred + 8 , kuiMean64);
521   ST64 (pPred + 16, kuiMean64);
522   ST64 (pPred + 24, kuiMean64);
523   ST64 (pPred + 32, kuiMean64);
524   ST64 (pPred + 40, kuiMean64);
525   ST64 (pPred + 48, kuiMean64);
526   ST64 (pPred + 56, kuiMean64);
527 }
528 
WelsIChromaPredDcNA_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)529 void WelsIChromaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
530   const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080ULL;
531   ST64 (pPred     , kuiDcValue64);
532   ST64 (pPred + 8 , kuiDcValue64);
533   ST64 (pPred + 16, kuiDcValue64);
534   ST64 (pPred + 24, kuiDcValue64);
535   ST64 (pPred + 32, kuiDcValue64);
536   ST64 (pPred + 40, kuiDcValue64);
537   ST64 (pPred + 48, kuiDcValue64);
538   ST64 (pPred + 56, kuiDcValue64);
539 }
540 
541 
WelsI16x16LumaPredPlane_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)542 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
543   int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
544   int32_t i, j;
545   uint8_t* pTop = &pRef[-kiStride];
546   uint8_t* pLeft = &pRef[-1];
547   int32_t iPredStride = 16;
548 
549   for (i = 0 ; i < 8 ; i ++) {
550     iTopSum += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
551     iLeftSum += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);
552   }
553 
554   iLTshift = (pLeft[15 * kiStride] + pTop[15]) << 4;
555   iTopshift = (5 * iTopSum + 32) >> 6;
556   iLeftshift = (5 * iLeftSum + 32) >> 6;
557 
558   for (i = 0 ; i < 16 ; i ++) {
559     for (j = 0 ; j < 16 ; j ++) {
560       pPred[j] = WelsClip1 ((iLTshift + iTopshift * (j - 7) + iLeftshift * (i - 7) + 16) >> 5);
561     }
562     pPred += iPredStride;
563   }
564 }
565 
/*!
 * \brief   16x16 luma DC prediction using both neighbours: all 256 output bytes
 *          are the rounded mean of the 16 top and the 16 left pixels.
 * \param   pPred       output; 256 bytes are written
 * \param   pRef        pointer to the top-left pixel of the current block
 * \param   kiStride    distance in bytes between two rows of pRef
 */
void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pTop  = pRef - kiStride;
  const uint8_t* pLeft = pRef - 1;
  int32_t iSum = 0;

  /* calculate the mean value */
  for (int32_t i = 0; i < 16; ++i) {
    iSum += pTop[i] + pLeft[i * kiStride];
  }
  const uint8_t kuiMean = (16 + iSum) >> 5;   // 32 samples, +16 rounds to nearest
  memset (pPred, kuiMean, 256);
}
580 
581 
/*!
 * \brief   16x16 luma DC prediction from the top row only: all 256 output bytes
 *          are the rounded mean of the 16 pixels above the block.
 * \param   pPred       output; 256 bytes are written
 * \param   pRef        pointer to the top-left pixel of the current block
 * \param   kiStride    distance in bytes between two rows of pRef
 */
void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pTop = pRef - kiStride;
  int32_t iSum = 0;

  /* calculate the mean value */
  for (int32_t i = 0; i < 16; ++i) {
    iSum += pTop[i];
  }
  const uint8_t kuiMean = (8 + iSum) >> 4;   // 16 samples, +8 rounds to nearest
  memset (pPred, kuiMean, 256);
}
594 
/*!
 * \brief   16x16 luma DC prediction from the left column only: all 256 output
 *          bytes are the rounded mean of the 16 pixels left of the block.
 * \param   pPred       output; 256 bytes are written
 * \param   pRef        pointer to the top-left pixel of the current block
 * \param   kiStride    distance in bytes between two rows of pRef
 */
void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pLeft = pRef - 1;
  int32_t iSum = 0;

  /* calculate the mean value */
  for (int32_t i = 0; i < 16; ++i) {
    iSum += pLeft[i * kiStride];
  }
  const uint8_t kuiMean = (8 + iSum) >> 4;   // 16 samples, +8 rounds to nearest
  memset (pPred, kuiMean, 256);
}
609 
/*!
 * \brief   16x16 luma DC prediction without usable neighbours: all 256 output
 *          bytes are set to the mid-grey value 0x80.
 * \param   pPred       output; 256 bytes are written
 * \param   pRef        not read; present to match the common predictor signature
 * \param   kiStride    not read
 */
void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  memset (pPred, 0x80, 256);
}
613 
WelsInitIntraPredFuncs(SWelsFuncPtrList * pFuncList,const uint32_t kuiCpuFlag)614 void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag) {
615   pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] =      WelsI16x16LumaPredV_c;
616   pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] =      WelsI16x16LumaPredH_c;
617   pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] =     WelsI16x16LumaPredDc_c;
618   pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] =      WelsI16x16LumaPredPlane_c;
619   pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] =   WelsI16x16LumaPredDcLeft_c;
620   pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] =   WelsI16x16LumaPredDcTop_c;
621   pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
622 
623   pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_c;
624   pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_c;
625   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_c;
626   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_L] = WelsI4x4LumaPredDcLeft_c;
627   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T] = WelsI4x4LumaPredDcTop_c;
628   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
629 
630   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_c;
631   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
632   pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_c;
633 
634   pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_c;
635   pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
636   pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_c;
637   pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_c;
638   pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_c;
639 
640   pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_c;
641   pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_c;
642   pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_c;
643   pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_c;
644   pFuncList->pfGetChromaPred[C_PRED_DC_L] = WelsIChromaPredDcLeft_c;
645   pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChromaPredDcTop_c;
646   pFuncList->pfGetChromaPred[C_PRED_DC_128] = WelsIChromaPredDcNA_c;
647 #ifdef HAVE_NEON
648   if (kuiCpuFlag & WELS_CPU_NEON) {
649     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_neon;
650     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD]  = WelsI4x4LumaPredHD_neon;
651     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU]  = WelsI4x4LumaPredHU_neon;
652     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR]  = WelsI4x4LumaPredVR_neon;
653     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_neon;
654     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL]  = WelsI4x4LumaPredVL_neon;
655     pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_neon;
656     pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_neon;
657 
658     pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_neon;
659     pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_neon;
660     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_neon;
661     pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_neon;
662 
663     pFuncList->pfGetChromaPred[C_PRED_DC]   = WelsIChromaPredDc_neon;
664     pFuncList->pfGetChromaPred[C_PRED_V]    = WelsIChromaPredV_neon;
665     pFuncList->pfGetChromaPred[C_PRED_P]    = WelsIChromaPredPlane_neon;
666     pFuncList->pfGetChromaPred[C_PRED_H]    = WelsIChromaPredH_neon;
667   }
668 #endif
669 
670 #if defined(HAVE_NEON_AARCH64)
671   if (kuiCpuFlag & WELS_CPU_NEON) {
672     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_AArch64_neon;
673     pFuncList->pfGetLumaI16x16Pred[I16_PRED_P]  = WelsI16x16LumaPredPlane_AArch64_neon;
674     pFuncList->pfGetLumaI16x16Pred[I16_PRED_H]  = WelsI16x16LumaPredH_AArch64_neon;
675     pFuncList->pfGetLumaI16x16Pred[I16_PRED_V]  = WelsI16x16LumaPredV_AArch64_neon;
676     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L]  = WelsI16x16LumaPredDcLeft_AArch64_neon;
677     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T]  = WelsI16x16LumaPredDcTop_AArch64_neon;
678 
679     pFuncList->pfGetLumaI4x4Pred[I4_PRED_H    ] = WelsI4x4LumaPredH_AArch64_neon;
680     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL  ] = WelsI4x4LumaPredDDL_AArch64_neon;
681     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_AArch64_neon;
682     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL   ] = WelsI4x4LumaPredVL_AArch64_neon;
683     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP ] = WelsI4x4LumaPredVLTop_AArch64_neon;
684     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR   ] = WelsI4x4LumaPredVR_AArch64_neon;
685     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU   ] = WelsI4x4LumaPredHU_AArch64_neon;
686     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD   ] = WelsI4x4LumaPredHD_AArch64_neon;
687     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC   ] = WelsI4x4LumaPredDc_AArch64_neon;
688     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T   ] = WelsI4x4LumaPredDcTop_AArch64_neon;
689 
690     pFuncList->pfGetChromaPred[C_PRED_H]       = WelsIChromaPredH_AArch64_neon;
691     pFuncList->pfGetChromaPred[C_PRED_V]       = WelsIChromaPredV_AArch64_neon;
692     pFuncList->pfGetChromaPred[C_PRED_P ]      = WelsIChromaPredPlane_AArch64_neon;
693     pFuncList->pfGetChromaPred[C_PRED_DC]      = WelsIChromaPredDc_AArch64_neon;
694     pFuncList->pfGetChromaPred[C_PRED_DC_T]      = WelsIChromaPredDcTop_AArch64_neon;
695   }
696 #endif//HAVE_NEON_AARCH64
697 
698 #ifdef X86_ASM
699   if (kuiCpuFlag & WELS_CPU_MMXEXT) {
700     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
701     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD]  = WelsI4x4LumaPredHD_mmx;
702     pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU]  = WelsI4x4LumaPredHU_mmx;
703     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR]  = WelsI4x4LumaPredVR_mmx;
704     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_mmx;
705     pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL]  = WelsI4x4LumaPredVL_mmx;
706     pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmx;
707   }
708   if (kuiCpuFlag & WELS_CPU_SSE2) {
709     pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_sse2;
710     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_sse2;
711     pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_sse2;
712 
713     pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_sse2;
714     pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_sse2;
715     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
716     pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_sse2;
717 
718     pFuncList->pfGetChromaPred[C_PRED_DC]   = WelsIChromaPredDc_sse2;
719     pFuncList->pfGetChromaPred[C_PRED_V]    = WelsIChromaPredV_sse2;
720     pFuncList->pfGetChromaPred[C_PRED_P]    = WelsIChromaPredPlane_sse2;
721   }
722 #endif
723 
724 #if defined(HAVE_MMI)
725   if (kuiCpuFlag & WELS_CPU_MMI) {
726     pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_mmi;
727     pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_mmi;
728     pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_mmi;
729     pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_mmi;
730 
731     pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmi;
732     pFuncList->pfGetChromaPred[C_PRED_DC]   = WelsIChromaPredDc_mmi;
733     pFuncList->pfGetChromaPred[C_PRED_V]    = WelsIChromaPredV_mmi;
734     pFuncList->pfGetChromaPred[C_PRED_P]    = WelsIChromaPredPlane_mmi;
735   }
736 #endif//HAVE_MMI
737 }
738 }
739