1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file get_intra_predictor.c
33 *
34 * \brief implementation for get intra predictor about 16x16, 4x4, chroma.
35 *
36 * \date 4/2/2009 Created
37 * 9/14/2009 C level based optimization with high performance gained.
38 * [const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
39 *
40 *************************************************************************************
41 */
42 #include "ls_defines.h"
43 #include "cpu_core.h"
44 #include "intra_pred_common.h"
45 #include "get_intra_predictor.h"
46
47 namespace WelsEnc {
// Constants named after the 4x4, 8x8 and 16x16 block sizes.
// NOTE(review): none of the three is referenced in the code visible in this file.
#define I4x4_COUNT 4
#define I8x8_COUNT 8
#define I16x16_COUNT 16

// Function-pointer types with the same signatures as the WelsFillingPred*_c helpers
// defined below: PFillingPred takes a source pointer, PFillingPred1to16 a single source byte.
typedef void (*PFillingPred) (uint8_t* pPred, uint8_t* pSrc);
typedef void (*PFillingPred1to16) (uint8_t* pPred, const uint8_t kuiSrc);
54
WelsFillingPred8to16_c(uint8_t * pPred,uint8_t * pSrc)55 static inline void WelsFillingPred8to16_c (uint8_t* pPred, uint8_t* pSrc) {
56 ST64 (pPred , LD64 (pSrc));
57 ST64 (pPred + 8, LD64 (pSrc));
58 }
WelsFillingPred8x2to16_c(uint8_t * pPred,uint8_t * pSrc)59 static inline void WelsFillingPred8x2to16_c (uint8_t* pPred, uint8_t* pSrc) {
60 ST64 (pPred , LD64 (pSrc));
61 ST64 (pPred + 8, LD64 (pSrc + 8));
62 }
WelsFillingPred1to16_c(uint8_t * pPred,const uint8_t kuiSrc)63 static inline void WelsFillingPred1to16_c (uint8_t* pPred, const uint8_t kuiSrc) {
64 const uint8_t kuiSrc8[8] = { kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc };
65 ST64 (pPred , LD64 (kuiSrc8));
66 ST64 (pPred + 8, LD64 (kuiSrc8));
67 }
68
// Map the generic filling-helper names used by the predictors below onto the
// plain C implementations defined above.
#define WelsFillingPred8to16 WelsFillingPred8to16_c
#define WelsFillingPred8x2to16 WelsFillingPred8x2to16_c
#define WelsFillingPred1to16 WelsFillingPred1to16_c
72
73
74
// 4, 8 and 12: one, two and three times the value of I4x4_PRED_STRIDE.
// NOTE(review): not referenced in the code visible in this file; the 4x4 predictors
// below index their 16-byte buffers with literal offsets instead.
#define I4x4_PRED_STRIDE 4
#define I4x4_PRED_STRIDE2 8
#define I4x4_PRED_STRIDE3 12
78
WelsI4x4LumaPredV_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)79 void WelsI4x4LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
80 const uint32_t kuiSrc = LD32 (&pRef[-kiStride]);
81 ENFORCE_STACK_ALIGN_1D (uint32_t, uiSrcx2, 2, 16)
82 uiSrcx2[0] = uiSrcx2[1] = kuiSrc;
83
84 WelsFillingPred8to16 (pPred, (uint8_t*)&uiSrcx2[0]);
85 }
86
WelsI4x4LumaPredH_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)87 void WelsI4x4LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
88 const uint32_t kiStridex2Left = (kiStride << 1) - 1;
89 const uint32_t kiStridex3Left = kiStride + kiStridex2Left;
90 const uint8_t kuiHor1 = pRef[-1];
91 const uint8_t kuiHor2 = pRef[kiStride - 1];
92 const uint8_t kuiHor3 = pRef[kiStridex2Left];
93 const uint8_t kuiHor4 = pRef[kiStridex3Left];
94 const uint8_t kuiVec1[4] = {kuiHor1, kuiHor1, kuiHor1, kuiHor1};
95 const uint8_t kuiVec2[4] = {kuiHor2, kuiHor2, kuiHor2, kuiHor2};
96 const uint8_t kuiVec3[4] = {kuiHor3, kuiHor3, kuiHor3, kuiHor3};
97 const uint8_t kuiVec4[4] = {kuiHor4, kuiHor4, kuiHor4, kuiHor4};
98 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
99 ST32 (&uiSrc[0], LD32 (kuiVec1));
100 ST32 (&uiSrc[4], LD32 (kuiVec2));
101 ST32 (&uiSrc[8], LD32 (kuiVec3));
102 ST32 (&uiSrc[12], LD32 (kuiVec4));
103
104 WelsFillingPred8x2to16 (pPred, uiSrc);
105 }
WelsI4x4LumaPredDc_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)106 void WelsI4x4LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
107 const uint8_t kuiDcValue = (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +
108 kiStride - 1] +
109 pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 4) >> 3;
110
111 WelsFillingPred1to16 (pPred, kuiDcValue);
112 }
113
WelsI4x4LumaPredDcLeft_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)114 void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
115 const uint8_t kuiDcValue = (pRef[-1] + pRef[kiStride - 1] + pRef[ (kiStride << 1) - 1] + pRef[ (kiStride << 1) +
116 kiStride - 1] + 2) >> 2;
117
118 WelsFillingPred1to16 (pPred, kuiDcValue);
119 }
120
WelsI4x4LumaPredDcTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)121 void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
122 const uint8_t kuiDcValue = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;
123
124 WelsFillingPred1to16 (pPred, kuiDcValue);
125 }
126
WelsI4x4LumaPredDcNA_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)127 void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
128 const uint8_t kuiDcValue = 0x80;
129
130 WelsFillingPred1to16 (pPred, kuiDcValue);
131 }
132
/* diagonal down-left */
WelsI4x4LumaPredDDL_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)134 void WelsI4x4LumaPredDDL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
135 /*get pTop*/
136 const uint8_t kuiT0 = pRef[-kiStride];
137 const uint8_t kuiT1 = pRef[1 - kiStride];
138 const uint8_t kuiT2 = pRef[2 - kiStride];
139 const uint8_t kuiT3 = pRef[3 - kiStride];
140 const uint8_t kuiT4 = pRef[4 - kiStride];
141 const uint8_t kuiT5 = pRef[5 - kiStride];
142 const uint8_t kuiT6 = pRef[6 - kiStride];
143 const uint8_t kuiT7 = pRef[7 - kiStride];
144 const uint8_t kuiDDL0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2; // uiDDL0
145 const uint8_t kuiDDL1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2; // uiDDL1
146 const uint8_t kuiDDL2 = (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2; // uiDDL2
147 const uint8_t kuiDDL3 = (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2; // uiDDL3
148 const uint8_t kuiDDL4 = (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2; // uiDDL4
149 const uint8_t kuiDDL5 = (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2; // uiDDL5
150 const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2; // uiDDL6
151 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
152 uiSrc[0] = kuiDDL0;
153 uiSrc[1] = uiSrc[4] = kuiDDL1;
154 uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDDL2;
155 uiSrc[3] = uiSrc[6] = uiSrc[9] = uiSrc[12] = kuiDDL3;
156 uiSrc[7] = uiSrc[10] = uiSrc[13] = kuiDDL4;
157 uiSrc[11] = uiSrc[14] = kuiDDL5;
158 uiSrc[15] = kuiDDL6;
159
160 WelsFillingPred8x2to16 (pPred, uiSrc);
161 }
162
/* diagonal down-left, using only the four top samples */
WelsI4x4LumaPredDDLTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)164 void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
165 /*get pTop*/
166 const uint8_t kuiT0 = pRef[-kiStride];
167 const uint8_t kuiT1 = pRef[1 - kiStride];
168 const uint8_t kuiT2 = pRef[2 - kiStride];
169 const uint8_t kuiT3 = pRef[3 - kiStride];
170 const uint8_t kuiDLT0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2; // uiDLT0
171 const uint8_t kuiDLT1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2; // uiDLT1
172 const uint8_t kuiDLT2 = (2 + kuiT2 + kuiT3 + (kuiT3 << 1)) >> 2; // uiDLT2
173 const uint8_t kuiDLT3 = (2 + (kuiT3 << 2)) >> 2; // uiDLT3
174 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
175 memset (&uiSrc[6], kuiDLT3, 10 * sizeof (uint8_t));
176 uiSrc[0] = kuiDLT0;
177 uiSrc[1] = uiSrc[4] = kuiDLT1;
178 uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDLT2;
179 uiSrc[3] = kuiDLT3;
180
181 WelsFillingPred8x2to16 (pPred, uiSrc);
182 }
183
184
185 /*down right*/
WelsI4x4LumaPredDDR_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)186 void WelsI4x4LumaPredDDR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
187 const int32_t kiStridex2 = kiStride << 1;
188 const int32_t kiStridex3 = kiStride + kiStridex2;
189 const uint8_t kuiLT = pRef[-kiStride - 1]; // pTop-pLeft
190 /*get pLeft and pTop*/
191 const uint8_t kuiL0 = pRef[-1];
192 const uint8_t kuiL1 = pRef[kiStride - 1];
193 const uint8_t kuiL2 = pRef[kiStridex2 - 1];
194 const uint8_t kuiL3 = pRef[kiStridex3 - 1];
195 const uint8_t kuiT0 = pRef[-kiStride];
196 const uint8_t kuiT1 = pRef[1 - kiStride];
197 const uint8_t kuiT2 = pRef[2 - kiStride];
198 const uint8_t kuiT3 = pRef[3 - kiStride];
199 const uint16_t kuiTL0 = 1 + kuiLT + kuiL0;
200 const uint16_t kuiLT0 = 1 + kuiLT + kuiT0;
201 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
202 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
203 const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
204 const uint16_t kuiL01 = 1 + kuiL0 + kuiL1;
205 const uint16_t kuiL12 = 1 + kuiL1 + kuiL2;
206 const uint16_t kuiL23 = 1 + kuiL2 + kuiL3;
207 const uint8_t kuiDDR0 = (kuiTL0 + kuiLT0) >> 2;
208 const uint8_t kuiDDR1 = (kuiLT0 + kuiT01) >> 2;
209 const uint8_t kuiDDR2 = (kuiT01 + kuiT12) >> 2;
210 const uint8_t kuiDDR3 = (kuiT12 + kuiT23) >> 2;
211 const uint8_t kuiDDR4 = (kuiTL0 + kuiL01) >> 2;
212 const uint8_t kuiDDR5 = (kuiL01 + kuiL12) >> 2;
213 const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2;
214 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
215 uiSrc[0] = uiSrc[5] = uiSrc[10] = uiSrc[15] = kuiDDR0;
216 uiSrc[1] = uiSrc[6] = uiSrc[11] = kuiDDR1;
217 uiSrc[2] = uiSrc[7] = kuiDDR2;
218 uiSrc[3] = kuiDDR3;
219 uiSrc[4] = uiSrc[9] = uiSrc[14] = kuiDDR4;
220 uiSrc[8] = uiSrc[13] = kuiDDR5;
221 uiSrc[12] = kuiDDR6;
222
223 WelsFillingPred8x2to16 (pPred, uiSrc);
224 }
225
226
/* vertical-left */
WelsI4x4LumaPredVL_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)228 void WelsI4x4LumaPredVL_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
229 /*get pTop*/
230 const uint8_t kuiT0 = pRef[-kiStride];
231 const uint8_t kuiT1 = pRef[1 - kiStride];
232 const uint8_t kuiT2 = pRef[2 - kiStride];
233 const uint8_t kuiT3 = pRef[3 - kiStride];
234 const uint8_t kuiT4 = pRef[4 - kiStride];
235 const uint8_t kuiT5 = pRef[5 - kiStride];
236 const uint8_t kuiT6 = pRef[6 - kiStride];
237 const uint8_t kuiVL0 = (1 + kuiT0 + kuiT1) >> 1; // uiVL0
238 const uint8_t kuiVL1 = (1 + kuiT1 + kuiT2) >> 1; // uiVL1
239 const uint8_t kuiVL2 = (1 + kuiT2 + kuiT3) >> 1; // uiVL2
240 const uint8_t kuiVL3 = (1 + kuiT3 + kuiT4) >> 1; // uiVL3
241 const uint8_t kuiVL4 = (1 + kuiT4 + kuiT5) >> 1; // uiVL4
242 const uint8_t kuiVL5 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2; // uiVL5
243 const uint8_t kuiVL6 = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2; // uiVL6
244 const uint8_t kuiVL7 = (2 + kuiT2 + (kuiT3 << 1) + kuiT4) >> 2; // uiVL7
245 const uint8_t kuiVL8 = (2 + kuiT3 + (kuiT4 << 1) + kuiT5) >> 2; // uiVL8
246 const uint8_t kuiVL9 = (2 + kuiT4 + (kuiT5 << 1) + kuiT6) >> 2; // uiVL9
247 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
248 uiSrc[0] = kuiVL0;
249 uiSrc[1] = uiSrc[8] = kuiVL1;
250 uiSrc[2] = uiSrc[9] = kuiVL2;
251 uiSrc[3] = uiSrc[10] = kuiVL3;
252 uiSrc[4] = kuiVL5;
253 uiSrc[5] = uiSrc[12] = kuiVL6;
254 uiSrc[6] = uiSrc[13] = kuiVL7;
255 uiSrc[7] = uiSrc[14] = kuiVL8;
256 uiSrc[11] = kuiVL4;
257 uiSrc[15] = kuiVL9;
258
259 WelsFillingPred8x2to16 (pPred, uiSrc);
260 }
261
262
263
/* vertical-left, using only the four top samples */
WelsI4x4LumaPredVLTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)265 void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
266 uint8_t* pTopLeft = &pRef[-kiStride - 1]; // pTop-pLeft
267 /*get pTop*/
268 const uint8_t kuiT0 = * (pTopLeft + 1);
269 const uint8_t kuiT1 = * (pTopLeft + 2);
270 const uint8_t kuiT2 = * (pTopLeft + 3);
271 const uint8_t kuiT3 = * (pTopLeft + 4);
272 const uint8_t kuiVLT0 = (1 + kuiT0 + kuiT1) >> 1; // uiVLT0
273 const uint8_t kuiVLT1 = (1 + kuiT1 + kuiT2) >> 1; // uiVLT1
274 const uint8_t kuiVLT2 = (1 + kuiT2 + kuiT3) >> 1; // uiVLT2
275 const uint8_t kuiVLT3 = (1 + (kuiT3 << 1)) >> 1; // uiVLT3
276 const uint8_t kuiVLT4 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2; // uiVLT4
277 const uint8_t kuiVLT5 = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2; // uiVLT5
278 const uint8_t kuiVLT6 = (2 + kuiT2 + (kuiT3 << 1) + kuiT3) >> 2; // uiVLT6
279 const uint8_t kuiVLT7 = (2 + (kuiT3 << 2)) >> 2; // uiVLT7
280 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
281 uiSrc[0] = kuiVLT0;
282 uiSrc[1] = uiSrc[8] = kuiVLT1;
283 uiSrc[2] = uiSrc[9] = kuiVLT2;
284 uiSrc[3] = uiSrc[10] = uiSrc[11] = kuiVLT3;
285 uiSrc[4] = kuiVLT4;
286 uiSrc[5] = uiSrc[12] = kuiVLT5;
287 uiSrc[6] = uiSrc[13] = kuiVLT6;
288 uiSrc[7] = uiSrc[14] = uiSrc[15] = kuiVLT7;
289
290 WelsFillingPred8x2to16 (pPred, uiSrc);
291 }
292
293 /*vertical right*/
WelsI4x4LumaPredVR_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)294 void WelsI4x4LumaPredVR_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
295 const int32_t kiStridex2 = kiStride << 1;
296 const uint8_t kuiLT = pRef[-kiStride - 1]; // pTop-pLeft
297 /*get pLeft and pTop*/
298 const uint8_t kuiL0 = pRef[-1];
299 const uint8_t kuiL1 = pRef[kiStride - 1];
300 const uint8_t kuiL2 = pRef[kiStridex2 - 1];
301 const uint8_t kuiT0 = pRef[-kiStride];
302 const uint8_t kuiT1 = pRef[1 - kiStride];
303 const uint8_t kuiT2 = pRef[2 - kiStride];
304 const uint8_t kuiT3 = pRef[3 - kiStride];
305 const uint8_t kuiVR0 = (1 + kuiLT + kuiT0) >> 1;
306 const uint8_t kuiVR1 = (1 + kuiT0 + kuiT1) >> 1;
307 const uint8_t kuiVR2 = (1 + kuiT1 + kuiT2) >> 1;
308 const uint8_t kuiVR3 = (1 + kuiT2 + kuiT3) >> 1;
309 const uint8_t kuiVR4 = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;
310 const uint8_t kuiVR5 = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;
311 const uint8_t kuiVR6 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;
312 const uint8_t kuiVR7 = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;
313 const uint8_t kuiVR8 = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;
314 const uint8_t kuiVR9 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;
315 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
316 uiSrc[0] = uiSrc[9] = kuiVR0;
317 uiSrc[1] = uiSrc[10] = kuiVR1;
318 uiSrc[2] = uiSrc[11] = kuiVR2;
319 uiSrc[3] = kuiVR3;
320 uiSrc[4] = uiSrc[13] = kuiVR4;
321 uiSrc[5] = uiSrc[14] = kuiVR5;
322 uiSrc[6] = uiSrc[15] = kuiVR6;
323 uiSrc[7] = kuiVR7;
324 uiSrc[8] = kuiVR8;
325 uiSrc[12] = kuiVR9;
326
327 WelsFillingPred8x2to16 (pPred, uiSrc);
328 }
329
330
331 /*horizontal up*/
WelsI4x4LumaPredHU_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)332 void WelsI4x4LumaPredHU_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
333 const int32_t kiStridex2 = kiStride << 1;
334 const int32_t kiStridex3 = kiStride + kiStridex2;
335 /*get pLeft*/
336 const uint8_t kuiL0 = pRef[-1];
337 const uint8_t kuiL1 = pRef[kiStride - 1];
338 const uint8_t kuiL2 = pRef[kiStridex2 - 1];
339 const uint8_t kuiL3 = pRef[kiStridex3 - 1];
340 const uint16_t kuiL01 = (1 + kuiL0 + kuiL1);
341 const uint16_t kuiL12 = (1 + kuiL1 + kuiL2);
342 const uint16_t kuiL23 = (1 + kuiL2 + kuiL3);
343 const uint8_t kuiHU0 = kuiL01 >> 1;
344 const uint8_t kuiHU1 = (kuiL01 + kuiL12) >> 2;
345 const uint8_t kuiHU2 = kuiL12 >> 1;
346 const uint8_t kuiHU3 = (kuiL12 + kuiL23) >> 2;
347 const uint8_t kuiHU4 = kuiL23 >> 1;
348 const uint8_t kuiHU5 = (1 + kuiL23 + (kuiL3 << 1)) >> 2;
349 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
350 uiSrc[0] = kuiHU0;
351 uiSrc[1] = kuiHU1;
352 uiSrc[2] = uiSrc[4] = kuiHU2;
353 uiSrc[3] = uiSrc[5] = kuiHU3;
354 uiSrc[6] = uiSrc[8] = kuiHU4;
355 uiSrc[7] = uiSrc[9] = kuiHU5;
356 memset (&uiSrc[10], kuiL3, 6 * sizeof (uint8_t));
357
358 WelsFillingPred8x2to16 (pPred, uiSrc);
359 }
360
361
362 /*horizontal down*/
WelsI4x4LumaPredHD_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)363 void WelsI4x4LumaPredHD_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
364 const int32_t kiStridex2 = kiStride << 1;
365 const int32_t kiStridex3 = kiStride + kiStridex2;
366 const uint8_t kuiLT = pRef[-kiStride - 1]; // pTop-pLeft
367 /*get pLeft and pTop*/
368 const uint8_t kuiL0 = pRef[-1];
369 const uint8_t kuiL1 = pRef[kiStride - 1];
370 const uint8_t kuiL2 = pRef[kiStridex2 - 1];
371 const uint8_t kuiL3 = pRef[kiStridex3 - 1];
372 const uint8_t kuiT0 = pRef[-kiStride];
373 const uint8_t kuiT1 = pRef[1 - kiStride];
374 const uint8_t kuiT2 = pRef[2 - kiStride];
375 const uint8_t kuiHD0 = (1 + kuiLT + kuiL0) >> 1; // uiHD0
376 const uint8_t kuiHD1 = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2; // uiHD1
377 const uint8_t kuiHD2 = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2; // uiHD2
378 const uint8_t kuiHD3 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2; // uiHD3
379 const uint8_t kuiHD4 = (1 + kuiL0 + kuiL1) >> 1; // uiHD4
380 const uint8_t kuiHD5 = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2; // uiHD5
381 const uint8_t kuiHD6 = (1 + kuiL1 + kuiL2) >> 1; // uiHD6
382 const uint8_t kuiHD7 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2; // uiHD7
383 const uint8_t kuiHD8 = (1 + kuiL2 + kuiL3) >> 1; // uiHD8
384 const uint8_t kuiHD9 = (2 + kuiL1 + (kuiL2 << 1) + kuiL3) >> 2; // uiHD9
385 ENFORCE_STACK_ALIGN_1D (uint8_t, uiSrc, 16, 16) // TobeCont'd about assign opt as follows
386 uiSrc[0] = uiSrc[6] = kuiHD0;
387 uiSrc[1] = uiSrc[7] = kuiHD1;
388 uiSrc[2] = kuiHD2;
389 uiSrc[3] = kuiHD3;
390 uiSrc[4] = uiSrc[10] = kuiHD4;
391 uiSrc[5] = uiSrc[11] = kuiHD5;
392 uiSrc[8] = uiSrc[14] = kuiHD6;
393 uiSrc[9] = uiSrc[15] = kuiHD7;
394 uiSrc[12] = kuiHD8;
395 uiSrc[13] = kuiHD9;
396
397 WelsFillingPred8x2to16 (pPred, uiSrc);
398 }
399
400
401
// Row pitch, in bytes, of the 8x8 chroma prediction buffer as used by
// WelsIChromaPredH_c and WelsIChromaPredPlane_c below.
#define I8x8_PRED_STRIDE 8
403
WelsIChromaPredV_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)404 void WelsIChromaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
405 const uint64_t kuiSrc64 = LD64 (&pRef[-kiStride]);
406
407 ST64 (pPred , kuiSrc64);
408 ST64 (pPred + 8 , kuiSrc64);
409 ST64 (pPred + 16, kuiSrc64);
410 ST64 (pPred + 24, kuiSrc64);
411 ST64 (pPred + 32, kuiSrc64);
412 ST64 (pPred + 40, kuiSrc64);
413 ST64 (pPred + 48, kuiSrc64);
414 ST64 (pPred + 56, kuiSrc64);
415 }
416
WelsIChromaPredH_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)417 void WelsIChromaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
418 int32_t iStridex7 = (kiStride << 3) - kiStride;
419 int32_t iI8x8Stridex7 = (I8x8_PRED_STRIDE << 3) - I8x8_PRED_STRIDE;
420 uint8_t i = 7;
421
422 do {
423 const uint8_t kuiLeft = pRef[iStridex7 - 1]; // pLeft value
424 uint64_t kuiSrc64 = (uint64_t) (0x0101010101010101ULL * kuiLeft);
425 ST64 (pPred + iI8x8Stridex7, kuiSrc64);
426
427 iStridex7 -= kiStride;
428 iI8x8Stridex7 -= I8x8_PRED_STRIDE;
429 } while (i-- > 0);
430 }
431
432
WelsIChromaPredPlane_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)433 void WelsIChromaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
434 int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
435 int32_t i, j;
436 uint8_t* pTop = &pRef[-kiStride];
437 uint8_t* pLeft = &pRef[-1];
438
439 for (i = 0 ; i < 4 ; i ++) {
440 iTopSum += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
441 iLeftSum += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);
442 }
443
444 iLTshift = (pLeft[7 * kiStride] + pTop[7]) << 4;
445 iTopshift = (17 * iTopSum + 16) >> 5;
446 iLeftshift = (17 * iLeftSum + 16) >> 5;
447
448 for (i = 0 ; i < 8 ; i ++) {
449 for (j = 0 ; j < 8 ; j ++) {
450 pPred[j] = WelsClip1 ((iLTshift + iTopshift * (j - 3) + iLeftshift * (i - 3) + 16) >> 5);
451 }
452 pPred += I8x8_PRED_STRIDE;
453 }
454 }
455
456
WelsIChromaPredDc_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)457 void WelsIChromaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
458 const int32_t kuiL1 = kiStride - 1;
459 const int32_t kuiL2 = kuiL1 + kiStride;
460 const int32_t kuiL3 = kuiL2 + kiStride;
461 const int32_t kuiL4 = kuiL3 + kiStride;
462 const int32_t kuiL5 = kuiL4 + kiStride;
463 const int32_t kuiL6 = kuiL5 + kiStride;
464 const int32_t kuiL7 = kuiL6 + kiStride;
465 /*caculate the iMean value*/
466 const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] +
467 pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 4) >> 3;
468 const uint32_t kuiSum2 = pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride];
469 const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];
470 const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;
471 const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;
472 const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;
473
474 const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
475 const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4, kuiMean4, kuiMean4, kuiMean4};
476 const uint64_t kuiTopMean64 = LD64 (kuiTopMean);
477 const uint64_t kuiBottomMean64 = LD64 (kuiBottomMean);
478
479 ST64 (pPred , kuiTopMean64);
480 ST64 (pPred + 8 , kuiTopMean64);
481 ST64 (pPred + 16, kuiTopMean64);
482 ST64 (pPred + 24, kuiTopMean64);
483 ST64 (pPred + 32, kuiBottomMean64);
484 ST64 (pPred + 40, kuiBottomMean64);
485 ST64 (pPred + 48, kuiBottomMean64);
486 ST64 (pPred + 56, kuiBottomMean64);
487 }
488
WelsIChromaPredDcLeft_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)489 void WelsIChromaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
490 const int32_t kuiL1 = kiStride - 1;
491 const int32_t kuiL2 = kuiL1 + kiStride;
492 const int32_t kuiL3 = kuiL2 + kiStride;
493 const int32_t kuiL4 = kuiL3 + kiStride;
494 const int32_t kuiL5 = kuiL4 + kiStride;
495 const int32_t kuiL6 = kuiL5 + kiStride;
496 const int32_t kuiL7 = kuiL6 + kiStride;
497 /*caculate the iMean value*/
498 const uint8_t kuiTopMean = (pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 2) >> 2 ;
499 const uint8_t kuiBottomMean = (pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7] + 2) >> 2;
500 const uint64_t kuiTopMean64 = (uint64_t) (0x0101010101010101ULL * kuiTopMean);
501 const uint64_t kuiBottomMean64 = (uint64_t) (0x0101010101010101ULL * kuiBottomMean);
502 ST64 (pPred , kuiTopMean64);
503 ST64 (pPred + 8 , kuiTopMean64);
504 ST64 (pPred + 16, kuiTopMean64);
505 ST64 (pPred + 24, kuiTopMean64);
506 ST64 (pPred + 32, kuiBottomMean64);
507 ST64 (pPred + 40, kuiBottomMean64);
508 ST64 (pPred + 48, kuiBottomMean64);
509 ST64 (pPred + 56, kuiBottomMean64);
510 }
511
WelsIChromaPredDcTop_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)512 void WelsIChromaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
513 /*caculate the iMean value*/
514 const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] + pRef[3 - kiStride] + 2) >> 2;
515 const uint8_t kuiMean2 = (pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride] + pRef[7 - kiStride] + 2) >> 2;
516 const uint8_t kuiMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
517 const uint64_t kuiMean64 = LD64 (kuiMean);
518
519 ST64 (pPred , kuiMean64);
520 ST64 (pPred + 8 , kuiMean64);
521 ST64 (pPred + 16, kuiMean64);
522 ST64 (pPred + 24, kuiMean64);
523 ST64 (pPred + 32, kuiMean64);
524 ST64 (pPred + 40, kuiMean64);
525 ST64 (pPred + 48, kuiMean64);
526 ST64 (pPred + 56, kuiMean64);
527 }
528
WelsIChromaPredDcNA_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)529 void WelsIChromaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
530 const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080ULL;
531 ST64 (pPred , kuiDcValue64);
532 ST64 (pPred + 8 , kuiDcValue64);
533 ST64 (pPred + 16, kuiDcValue64);
534 ST64 (pPred + 24, kuiDcValue64);
535 ST64 (pPred + 32, kuiDcValue64);
536 ST64 (pPred + 40, kuiDcValue64);
537 ST64 (pPred + 48, kuiDcValue64);
538 ST64 (pPred + 56, kuiDcValue64);
539 }
540
541
WelsI16x16LumaPredPlane_c(uint8_t * pPred,uint8_t * pRef,const int32_t kiStride)542 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
543 int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
544 int32_t i, j;
545 uint8_t* pTop = &pRef[-kiStride];
546 uint8_t* pLeft = &pRef[-1];
547 int32_t iPredStride = 16;
548
549 for (i = 0 ; i < 8 ; i ++) {
550 iTopSum += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
551 iLeftSum += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);
552 }
553
554 iLTshift = (pLeft[15 * kiStride] + pTop[15]) << 4;
555 iTopshift = (5 * iTopSum + 32) >> 6;
556 iLeftshift = (5 * iLeftSum + 32) >> 6;
557
558 for (i = 0 ; i < 16 ; i ++) {
559 for (j = 0 ; j < 16 ; j ++) {
560 pPred[j] = WelsClip1 ((iLTshift + iTopshift * (j - 7) + iLeftshift * (i - 7) + 16) >> 5);
561 }
562 pPred += iPredStride;
563 }
564 }
565
/*!
 * \brief 16x16 luma DC prediction from both neighbours: the rounded mean of the
 *        sixteen top and sixteen left samples fills all 256 prediction bytes.
 *
 * \param pPred    destination, 256 bytes
 * \param pRef     top-left sample of the current block in the reference plane
 * \param kiStride distance in bytes between rows of pRef
 */
void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pTop = pRef - kiStride;
  const uint8_t* pLeft = pRef - 1;
  int32_t iTotal = 16; // rounding term for the divide by 32

  for (int32_t k = 0; k < 16; ++k) {
    iTotal += pTop[k] + pLeft[k * kiStride];
  }

  memset (pPred, (uint8_t) (iTotal >> 5), 256);
}
580
581
/*!
 * \brief 16x16 luma DC prediction from the top row only: the rounded mean of the
 *        sixteen top samples fills all 256 prediction bytes.
 *
 * \param pPred    destination, 256 bytes
 * \param pRef     top-left sample of the current block in the reference plane
 * \param kiStride distance in bytes between rows of pRef
 */
void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pTop = pRef - kiStride;
  int32_t iTotal = 8; // rounding term for the divide by 16

  for (int32_t iCol = 0; iCol < 16; ++iCol) {
    iTotal += pTop[iCol];
  }

  memset (pPred, (uint8_t) (iTotal >> 4), 256);
}
594
/*!
 * \brief 16x16 luma DC prediction from the left column only: the rounded mean of
 *        the sixteen left samples fills all 256 prediction bytes.
 *
 * \param pPred    destination, 256 bytes
 * \param pRef     top-left sample of the current block in the reference plane
 * \param kiStride distance in bytes between rows of pRef
 */
void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const uint8_t* pLeft = pRef - 1;
  int32_t iTotal = 8; // rounding term for the divide by 16

  for (int32_t iRow = 0; iRow < 16; ++iRow) {
    iTotal += pLeft[iRow * kiStride];
  }

  memset (pPred, (uint8_t) (iTotal >> 4), 256);
}
609
/*!
 * \brief 16x16 luma DC prediction without neighbours: all 256 prediction bytes
 *        are set to 0x80.
 *
 * \param pPred    destination, 256 bytes
 * \param pRef     not read
 * \param kiStride not read
 */
void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
  const int32_t kiFillValue = 0x80;

  (void) pRef;
  (void) kiStride;

  memset (pPred, kiFillValue, 256);
}
613
/*!
 * \brief Fill the intra-prediction function tables of pFuncList.
 *
 * Every slot first receives a plain C implementation. The conditionally compiled
 * sections that follow overwrite individual slots when the matching bit is set
 * in kuiCpuFlag. The sections run in source order, so if more than one of them
 * is compiled in and enabled, the last assignment to a slot is the one that stays.
 *
 * \param pFuncList  table holder; its pfGetLumaI16x16Pred, pfGetLumaI4x4Pred and
 *                   pfGetChromaPred arrays are written
 * \param kuiCpuFlag bitmask tested against WELS_CPU_NEON, WELS_CPU_MMXEXT,
 *                   WELS_CPU_SSE2 and WELS_CPU_MMI
 */
void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag) {
  // C defaults for 16x16 luma.
  // NOTE(review): WelsI16x16LumaPredV_c / WelsI16x16LumaPredH_c are not defined in
  // this file; presumably they come from intra_pred_common.h -- confirm.
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] = WelsI16x16LumaPredDcLeft_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] = WelsI16x16LumaPredDcTop_c;
  pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;

  // C defaults for 4x4 luma: V, H and the four DC variants.
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_L] = WelsI4x4LumaPredDcLeft_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T] = WelsI4x4LumaPredDcTop_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;

  // C defaults for 4x4 luma: diagonal modes.
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_c;

  // C defaults for 4x4 luma: vertical-left/right and horizontal-up/down modes.
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_c;
  pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_c;

  // C defaults for chroma.
  pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_c;
  pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_c;
  pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_c;
  pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_c;
  pFuncList->pfGetChromaPred[C_PRED_DC_L] = WelsIChromaPredDcLeft_c;
  pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChromaPredDcTop_c;
  pFuncList->pfGetChromaPred[C_PRED_DC_128] = WelsIChromaPredDcNA_c;
#ifdef HAVE_NEON
  // Built with HAVE_NEON: replace a subset of slots when the NEON flag is set.
  // Slots not listed here keep their C implementation.
  if (kuiCpuFlag & WELS_CPU_NEON) {
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_neon;

    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_neon;

    pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_neon;
    pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_neon;
    pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_neon;
    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_neon;
  }
#endif

#if defined(HAVE_NEON_AARCH64)
  // Built with HAVE_NEON_AARCH64: same runtime flag (WELS_CPU_NEON), AArch64 variants.
  if (kuiCpuFlag & WELS_CPU_NEON) {
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_AArch64_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_AArch64_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_AArch64_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_AArch64_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] = WelsI16x16LumaPredDcLeft_AArch64_neon;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] = WelsI16x16LumaPredDcTop_AArch64_neon;

    pFuncList->pfGetLumaI4x4Pred[I4_PRED_H ] = WelsI4x4LumaPredH_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL ] = WelsI4x4LumaPredDDL_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL ] = WelsI4x4LumaPredVL_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP ] = WelsI4x4LumaPredVLTop_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR ] = WelsI4x4LumaPredVR_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU ] = WelsI4x4LumaPredHU_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD ] = WelsI4x4LumaPredHD_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC ] = WelsI4x4LumaPredDc_AArch64_neon;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T ] = WelsI4x4LumaPredDcTop_AArch64_neon;

    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_AArch64_neon;
    pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_AArch64_neon;
    pFuncList->pfGetChromaPred[C_PRED_P ] = WelsIChromaPredPlane_AArch64_neon;
    pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_AArch64_neon;
    pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChromaPredDcTop_AArch64_neon;
  }
#endif//HAVE_NEON_AARCH64

#ifdef X86_ASM
  // Built with X86_ASM: the MMXEXT and SSE2 flags are tested independently and
  // write disjoint sets of slots.
  if (kuiCpuFlag & WELS_CPU_MMXEXT) {
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_mmx;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_mmx;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_mmx;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_mmx;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_mmx;
    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmx;
  }
  if (kuiCpuFlag & WELS_CPU_SSE2) {
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_sse2;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_sse2;
    pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_sse2;

    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_sse2;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_sse2;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_sse2;

    pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_sse2;
    pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_sse2;
    pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_sse2;
  }
#endif

#if defined(HAVE_MMI)
  // Built with HAVE_MMI: replace the 16x16 luma and chroma V/H/DC/plane slots
  // when the MMI flag is set; the 4x4 luma slots are not touched here.
  if (kuiCpuFlag & WELS_CPU_MMI) {
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_mmi;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_mmi;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_mmi;
    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_mmi;

    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmi;
    pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_mmi;
    pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_mmi;
    pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_mmi;
  }
#endif//HAVE_MMI
}
738 }
739