1 /*!
2 * \copy
3 * Copyright (c) 2009-2013, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file get_intra_predictor.c
33 *
 * \brief   Implementation of the intra predictors for 4x4, 8x8 and 16x16 luma blocks and for chroma.
35 *
36 * \date 4/2/2009 Created
37 * 9/14/2009 C level based optimization with high performance gained.
38 * [const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
39 *
40 *************************************************************************************
41 */
42 #include <string.h>
43
44 #include "macros.h"
45 #include "ls_defines.h"
46 #include "get_intra_predictor.h"
47
48 namespace WelsDec {
49
// Numeric constants 4, 8 and 16. Presumably the edge lengths (in samples) of the
// 4x4, 8x8 and 16x16 block sizes -- TODO confirm; they are not referenced in this part of the file.
#define I4x4_COUNT 4
#define I8x8_COUNT 8
#define I16x16_COUNT 16
53
WelsI4x4LumaPredV_c(uint8_t * pPred,const int32_t kiStride)54 void WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
55 const uint32_t kuiVal = LD32A4 (pPred - kiStride);
56
57 ST32A4 (pPred, kuiVal);
58 ST32A4 (pPred + kiStride, kuiVal);
59 ST32A4 (pPred + (kiStride << 1), kuiVal);
60 ST32A4 (pPred + (kiStride << 1) + kiStride, kuiVal);
61 }
62
WelsI4x4LumaPredH_c(uint8_t * pPred,const int32_t kiStride)63 void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
64 const int32_t kiStride2 = kiStride << 1;
65 const int32_t kiStride3 = kiStride2 + kiStride;
66 const uint32_t kuiL0 = 0x01010101U * pPred[-1 ];
67 const uint32_t kuiL1 = 0x01010101U * pPred[-1 + kiStride ];
68 const uint32_t kuiL2 = 0x01010101U * pPred[-1 + kiStride2];
69 const uint32_t kuiL3 = 0x01010101U * pPred[-1 + kiStride3];
70
71 ST32A4 (pPred, kuiL0);
72 ST32A4 (pPred + kiStride, kuiL1);
73 ST32A4 (pPred + kiStride2, kuiL2);
74 ST32A4 (pPred + kiStride3, kuiL3);
75 }
76
WelsI4x4LumaPredDc_c(uint8_t * pPred,const int32_t kiStride)77 void WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
78 const int32_t kiStride2 = kiStride << 1;
79 const int32_t kiStride3 = kiStride2 + kiStride;
80 const uint8_t kuiMean = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] +
81 pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 4) >> 3;
82 const uint32_t kuiMean32 = 0x01010101U * kuiMean;
83
84 ST32A4 (pPred, kuiMean32);
85 ST32A4 (pPred + kiStride, kuiMean32);
86 ST32A4 (pPred + kiStride2, kuiMean32);
87 ST32A4 (pPred + kiStride3, kuiMean32);
88 }
89
WelsI4x4LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)90 void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
91 const int32_t kiStride2 = kiStride << 1;
92 const int32_t kiStride3 = kiStride2 + kiStride;
93 const uint8_t kuiMean = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] + 2) >> 2;
94 const uint32_t kuiMean32 = 0x01010101U * kuiMean;
95
96 ST32A4 (pPred, kuiMean32);
97 ST32A4 (pPred + kiStride, kuiMean32);
98 ST32A4 (pPred + kiStride2, kuiMean32);
99 ST32A4 (pPred + kiStride3, kuiMean32);
100 }
101
WelsI4x4LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)102 void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
103 const int32_t kiStride2 = kiStride << 1;
104 const int32_t kiStride3 = kiStride2 + kiStride;
105 const uint8_t kuiMean = (pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 2)
106 >> 2;
107 const uint32_t kuiMean32 = 0x01010101U * kuiMean;
108
109 ST32A4 (pPred, kuiMean32);
110 ST32A4 (pPred + kiStride, kuiMean32);
111 ST32A4 (pPred + kiStride2, kuiMean32);
112 ST32A4 (pPred + kiStride3, kuiMean32);
113 }
114
WelsI4x4LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)115 void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
116 const uint32_t kuiDC32 = 0x80808080U;
117
118 ST32A4 (pPred, kuiDC32);
119 ST32A4 (pPred + kiStride, kuiDC32);
120 ST32A4 (pPred + (kiStride << 1), kuiDC32);
121 ST32A4 (pPred + (kiStride << 1) + kiStride, kuiDC32);
122 }
123
/*down left*/
WelsI4x4LumaPredDDL_c(uint8_t * pPred,const int32_t kiStride)125 void WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride) {
126 const int32_t kiStride2 = kiStride << 1;
127 const int32_t kiStride3 = kiStride + kiStride2;
128 /*get pTop*/
129 uint8_t* ptop = &pPred[-kiStride];
130 const uint8_t kuiT0 = *ptop;
131 const uint8_t kuiT1 = * (ptop + 1);
132 const uint8_t kuiT2 = * (ptop + 2);
133 const uint8_t kuiT3 = * (ptop + 3);
134 const uint8_t kuiT4 = * (ptop + 4);
135 const uint8_t kuiT5 = * (ptop + 5);
136 const uint8_t kuiT6 = * (ptop + 6);
137 const uint8_t kuiT7 = * (ptop + 7);
138 const uint8_t kuiDDL0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2; // kDDL0
139 const uint8_t kuiDDL1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2; // kDDL1
140 const uint8_t kuiDDL2 = (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2; // kDDL2
141 const uint8_t kuiDDL3 = (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2; // kDDL3
142 const uint8_t kuiDDL4 = (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2; // kDDL4
143 const uint8_t kuiDDL5 = (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2; // kDDL5
144 const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2; // kDDL6
145 const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 };
146
147 ST32A4 (pPred, LD32 (kuiList));
148 ST32A4 (pPred + kiStride, LD32 (kuiList + 1));
149 ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
150 ST32A4 (pPred + kiStride3, LD32 (kuiList + 3));
151 }
152
/*down left*/
WelsI4x4LumaPredDDLTop_c(uint8_t * pPred,const int32_t kiStride)154 void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride) {
155 const int32_t kiStride2 = kiStride << 1;
156 const int32_t kiStride3 = kiStride + kiStride2;
157 /*get pTop*/
158 uint8_t* ptop = &pPred[-kiStride];
159 const uint8_t kuiT0 = *ptop;
160 const uint8_t kuiT1 = * (ptop + 1);
161 const uint8_t kuiT2 = * (ptop + 2);
162 const uint8_t kuiT3 = * (ptop + 3);
163 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
164 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
165 const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
166 const uint16_t kuiT33 = 1 + (kuiT3 << 1);
167 const uint8_t kuiDLT0 = (kuiT01 + kuiT12) >> 2; // kDLT0
168 const uint8_t kuiDLT1 = (kuiT12 + kuiT23) >> 2; // kDLT1
169 const uint8_t kuiDLT2 = (kuiT23 + kuiT33) >> 2; // kDLT2
170 const uint8_t kuiDLT3 = kuiT33 >> 1; // kDLT3
171 const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 };
172
173 ST32A4 (pPred, LD32 (kuiList));
174 ST32A4 (pPred + kiStride, LD32 (kuiList + 1));
175 ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
176 ST32A4 (pPred + kiStride3, LD32 (kuiList + 3));
177 }
178
179
180 /*down right*/
WelsI4x4LumaPredDDR_c(uint8_t * pPred,const int32_t kiStride)181 void WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride) {
182 const int32_t kiStride2 = kiStride << 1;
183 const int32_t kiStride3 = kiStride + kiStride2;
184 uint8_t* ptopleft = &pPred[- (kiStride + 1)];
185 uint8_t* pleft = &pPred[-1];
186 const uint8_t kuiLT = *ptopleft;
187 /*get pLeft and pTop*/
188 const uint8_t kuiL0 = *pleft;
189 const uint8_t kuiL1 = * (pleft + kiStride);
190 const uint8_t kuiL2 = * (pleft + kiStride2);
191 const uint8_t kuiL3 = * (pleft + kiStride3);
192 const uint8_t kuiT0 = * (ptopleft + 1);
193 const uint8_t kuiT1 = * (ptopleft + 2);
194 const uint8_t kuiT2 = * (ptopleft + 3);
195 const uint8_t kuiT3 = * (ptopleft + 4);
196 const uint16_t kuiTL0 = 1 + kuiLT + kuiL0;
197 const uint16_t kuiLT0 = 1 + kuiLT + kuiT0;
198 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
199 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
200 const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
201 const uint16_t kuiL01 = 1 + kuiL0 + kuiL1;
202 const uint16_t kuiL12 = 1 + kuiL1 + kuiL2;
203 const uint16_t kuiL23 = 1 + kuiL2 + kuiL3;
204 const uint8_t kuiDDR0 = (kuiTL0 + kuiLT0) >> 2; // kuiDDR0
205 const uint8_t kuiDDR1 = (kuiLT0 + kuiT01) >> 2; // kuiDDR1
206 const uint8_t kuiDDR2 = (kuiT01 + kuiT12) >> 2; // kuiDDR2
207 const uint8_t kuiDDR3 = (kuiT12 + kuiT23) >> 2; // kuiDDR3
208 const uint8_t kuiDDR4 = (kuiTL0 + kuiL01) >> 2; // kuiDDR4
209 const uint8_t kuiDDR5 = (kuiL01 + kuiL12) >> 2; // kuiDDR5
210 const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2; // kuiDDR6
211 const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0 };
212
213 ST32A4 (pPred, LD32 (kuiList + 3));
214 ST32A4 (pPred + kiStride, LD32 (kuiList + 2));
215 ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
216 ST32A4 (pPred + kiStride3, LD32 (kuiList));
217 }
218
219
/*vertical left*/
WelsI4x4LumaPredVL_c(uint8_t * pPred,const int32_t kiStride)221 void WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride) {
222 const int32_t kiStride2 = kiStride << 1;
223 const int32_t kiStride3 = kiStride + kiStride2;
224 uint8_t* ptopleft = &pPred[- (kiStride + 1)];
225 /*get pTop*/
226 const uint8_t kuiT0 = * (ptopleft + 1);
227 const uint8_t kuiT1 = * (ptopleft + 2);
228 const uint8_t kuiT2 = * (ptopleft + 3);
229 const uint8_t kuiT3 = * (ptopleft + 4);
230 const uint8_t kuiT4 = * (ptopleft + 5);
231 const uint8_t kuiT5 = * (ptopleft + 6);
232 const uint8_t kuiT6 = * (ptopleft + 7);
233 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
234 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
235 const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
236 const uint16_t kuiT34 = 1 + kuiT3 + kuiT4;
237 const uint16_t kuiT45 = 1 + kuiT4 + kuiT5;
238 const uint16_t kuiT56 = 1 + kuiT5 + kuiT6;
239 const uint8_t kuiVL0 = kuiT01 >> 1; // kuiVL0
240 const uint8_t kuiVL1 = kuiT12 >> 1; // kuiVL1
241 const uint8_t kuiVL2 = kuiT23 >> 1; // kuiVL2
242 const uint8_t kuiVL3 = kuiT34 >> 1; // kuiVL3
243 const uint8_t kuiVL4 = kuiT45 >> 1; // kuiVL4
244 const uint8_t kuiVL5 = (kuiT01 + kuiT12) >> 2; // kuiVL5
245 const uint8_t kuiVL6 = (kuiT12 + kuiT23) >> 2; // kuiVL6
246 const uint8_t kuiVL7 = (kuiT23 + kuiT34) >> 2; // kuiVL7
247 const uint8_t kuiVL8 = (kuiT34 + kuiT45) >> 2; // kuiVL8
248 const uint8_t kuiVL9 = (kuiT45 + kuiT56) >> 2; // kuiVL9
249 const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 };
250
251 ST32A4 (pPred, LD32 (kuiList));
252 ST32A4 (pPred + kiStride, LD32 (kuiList + 5));
253 ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
254 ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
255 }
256
/*vertical left*/
WelsI4x4LumaPredVLTop_c(uint8_t * pPred,const int32_t kiStride)258 void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride) {
259 const int32_t kiStride2 = kiStride << 1;
260 const int32_t kiStride3 = kiStride + kiStride2;
261 uint8_t* ptopleft = &pPred[- (kiStride + 1)];
262 /*get pTop*/
263 const uint8_t kuiT0 = * (ptopleft + 1);
264 const uint8_t kuiT1 = * (ptopleft + 2);
265 const uint8_t kuiT2 = * (ptopleft + 3);
266 const uint8_t kuiT3 = * (ptopleft + 4);
267 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
268 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
269 const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
270 const uint16_t kuiT33 = 1 + (kuiT3 << 1);
271 const uint8_t kuiVL0 = kuiT01 >> 1;
272 const uint8_t kuiVL1 = kuiT12 >> 1;
273 const uint8_t kuiVL2 = kuiT23 >> 1;
274 const uint8_t kuiVL3 = kuiT33 >> 1;
275 const uint8_t kuiVL4 = (kuiT01 + kuiT12) >> 2;
276 const uint8_t kuiVL5 = (kuiT12 + kuiT23) >> 2;
277 const uint8_t kuiVL6 = (kuiT23 + kuiT33) >> 2;
278 const uint8_t kuiVL7 = kuiVL3;
279 const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 };
280
281 ST32A4 (pPred, LD32 (kuiList));
282 ST32A4 (pPred + kiStride, LD32 (kuiList + 5));
283 ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
284 ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
285 }
286
287
288 /*vertical right*/
WelsI4x4LumaPredVR_c(uint8_t * pPred,const int32_t kiStride)289 void WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride) {
290 const int32_t kiStride2 = kiStride << 1;
291 const int32_t kiStride3 = kiStride + kiStride2;
292 const uint8_t kuiLT = pPred[-kiStride - 1];
293 /*get pLeft and pTop*/
294 const uint8_t kuiL0 = pPred[ - 1];
295 const uint8_t kuiL1 = pPred[kiStride - 1];
296 const uint8_t kuiL2 = pPred[kiStride2 - 1];
297 const uint8_t kuiT0 = pPred[ -kiStride];
298 const uint8_t kuiT1 = pPred[1 - kiStride];
299 const uint8_t kuiT2 = pPred[2 - kiStride];
300 const uint8_t kuiT3 = pPred[3 - kiStride];
301 const uint8_t kuiVR0 = (1 + kuiLT + kuiT0) >> 1; // kuiVR0
302 const uint8_t kuiVR1 = (1 + kuiT0 + kuiT1) >> 1; // kuiVR1
303 const uint8_t kuiVR2 = (1 + kuiT1 + kuiT2) >> 1; // kuiVR2
304 const uint8_t kuiVR3 = (1 + kuiT2 + kuiT3) >> 1; // kuiVR3
305 const uint8_t kuiVR4 = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2; // kuiVR4
306 const uint8_t kuiVR5 = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2; // kuiVR5
307 const uint8_t kuiVR6 = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2; // kuiVR6
308 const uint8_t kuiVR7 = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2; // kuiVR7
309 const uint8_t kuiVR8 = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2; // kuiVR8
310 const uint8_t kuiVR9 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2; // kuiVR9
311 const uint8_t kuiList[10] = { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 };
312
313 ST32A4 (pPred, LD32 (kuiList + 1));
314 ST32A4 (pPred + kiStride, LD32 (kuiList + 6));
315 ST32A4 (pPred + kiStride2, LD32 (kuiList));
316 ST32A4 (pPred + kiStride3, LD32 (kuiList + 5));
317 }
318
319 /*horizontal up*/
WelsI4x4LumaPredHU_c(uint8_t * pPred,const int32_t kiStride)320 void WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride) {
321 const int32_t kiStride2 = kiStride << 1;
322 const int32_t kiStride3 = kiStride + kiStride2;
323 /*get pLeft*/
324 const uint8_t kuiL0 = pPred[ - 1];
325 const uint8_t kuiL1 = pPred[kiStride - 1];
326 const uint8_t kuiL2 = pPred[kiStride2 - 1];
327 const uint8_t kuiL3 = pPred[kiStride3 - 1];
328 const uint16_t kuiL01 = 1 + kuiL0 + kuiL1;
329 const uint16_t kuiL12 = 1 + kuiL1 + kuiL2;
330 const uint16_t kuiL23 = 1 + kuiL2 + kuiL3;
331 const uint8_t kuiHU0 = kuiL01 >> 1;
332 const uint8_t kuiHU1 = (kuiL01 + kuiL12) >> 2;
333 const uint8_t kuiHU2 = kuiL12 >> 1;
334 const uint8_t kuiHU3 = (kuiL12 + kuiL23) >> 2;
335 const uint8_t kuiHU4 = kuiL23 >> 1;
336 const uint8_t kuiHU5 = (1 + kuiL23 + (kuiL3 << 1)) >> 2;
337 const uint8_t kuiList[10] = { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 };
338
339 ST32A4 (pPred, LD32 (kuiList));
340 ST32A4 (pPred + kiStride, LD32 (kuiList + 2));
341 ST32A4 (pPred + kiStride2, LD32 (kuiList + 4));
342 ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
343 }
344
345 /*horizontal down*/
WelsI4x4LumaPredHD_c(uint8_t * pPred,const int32_t kiStride)346 void WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride) {
347 const int32_t kiStride2 = kiStride << 1;
348 const int32_t kiStride3 = kiStride + kiStride2;
349 const uint8_t kuiLT = pPred[- (kiStride + 1)];
350 /*get pLeft and pTop*/
351 const uint8_t kuiL0 = pPred[-1 ];
352 const uint8_t kuiL1 = pPred[-1 + kiStride ];
353 const uint8_t kuiL2 = pPred[-1 + kiStride2];
354 const uint8_t kuiL3 = pPred[-1 + kiStride3];
355 const uint8_t kuiT0 = pPred[-kiStride ];
356 const uint8_t kuiT1 = pPred[-kiStride + 1 ];
357 const uint8_t kuiT2 = pPred[-kiStride + 2 ];
358 const uint16_t kuiTL0 = 1 + kuiLT + kuiL0;
359 const uint16_t kuiLT0 = 1 + kuiLT + kuiT0;
360 const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
361 const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
362 const uint16_t kuiL01 = 1 + kuiL0 + kuiL1;
363 const uint16_t kuiL12 = 1 + kuiL1 + kuiL2;
364 const uint16_t kuiL23 = 1 + kuiL2 + kuiL3;
365 const uint8_t kuiHD0 = kuiTL0 >> 1;
366 const uint8_t kuiHD1 = (kuiTL0 + kuiLT0) >> 2;
367 const uint8_t kuiHD2 = (kuiLT0 + kuiT01) >> 2;
368 const uint8_t kuiHD3 = (kuiT01 + kuiT12) >> 2;
369 const uint8_t kuiHD4 = kuiL01 >> 1;
370 const uint8_t kuiHD5 = (kuiTL0 + kuiL01) >> 2;
371 const uint8_t kuiHD6 = kuiL12 >> 1;
372 const uint8_t kuiHD7 = (kuiL01 + kuiL12) >> 2;
373 const uint8_t kuiHD8 = kuiL23 >> 1;
374 const uint8_t kuiHD9 = (kuiL12 + kuiL23) >> 2;
375 const uint8_t kuiList[10] = { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 };
376
377 ST32A4 (pPred, LD32 (kuiList + 6));
378 ST32A4 (pPred + kiStride, LD32 (kuiList + 4));
379 ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
380 ST32A4 (pPred + kiStride3, LD32 (kuiList));
381 }
382
WelsI8x8LumaPredV_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)383 void WelsI8x8LumaPredV_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
384 uint64_t uiTop = 0;
385 int32_t iStride[8];
386 uint8_t uiPixelFilterT[8];
387 int32_t i;
388
389 for (iStride[0] = 0, i = 1; i < 8; i++) {
390 iStride[i] = iStride[i - 1] + kiStride;
391 }
392
393 uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
394 pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2);
395 for (i = 1; i < 7; i++) {
396 uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
397 }
398 uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
399 pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2);
400
401 // 8-89
402 for (i = 7; i >= 0; i--) {
403 uiTop = ((uiTop << 8) | uiPixelFilterT[i]);
404 }
405
406 for (i = 0; i < 8; i++) {
407 ST64A8 (pPred + kiStride * i, uiTop);
408 }
409 }
410
WelsI8x8LumaPredH_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)411 void WelsI8x8LumaPredH_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
412 uint64_t uiLeft;
413 int32_t iStride[8];
414 uint8_t uiPixelFilterL[8];
415 int32_t i;
416
417 for (iStride[0] = 0, i = 1; i < 8; i++) {
418 iStride[i] = iStride[i - 1] + kiStride;
419 }
420
421 uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
422 pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
423 for (i = 1; i < 7; i++) {
424 uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
425 2);
426 }
427 uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
428
429 // 8-90
430 for (i = 0; i < 8; i++) {
431 uiLeft = 0x0101010101010101ULL * uiPixelFilterL[i];
432 ST64A8 (pPred + iStride[i], uiLeft);
433 }
434 }
435
WelsI8x8LumaPredDc_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)436 void WelsI8x8LumaPredDc_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
437 int32_t iStride[8];
438 uint8_t uiPixelFilterL[8];
439 uint8_t uiPixelFilterT[8];
440 uint16_t uiTotal = 0;
441 int32_t i;
442
443 for (iStride[0] = 0, i = 1; i < 8; i++) {
444 iStride[i] = iStride[i - 1] + kiStride;
445 }
446
447 uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
448 pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
449 uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
450 pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2);
451 for (i = 1; i < 7; i++) {
452 uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
453 2);
454 uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
455 }
456 uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
457 uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
458 pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2);
459
460 // 8-91
461 for (i = 0; i < 8; i++) {
462 uiTotal += uiPixelFilterL[i];
463 uiTotal += uiPixelFilterT[i];
464 }
465
466 const uint8_t kuiMean = ((uiTotal + 8) >> 4);
467 const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
468
469 for (i = 0; i < 8; i++) {
470 ST64A8 (pPred + iStride[i], kuiMean64);
471 }
472 }
473
WelsI8x8LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)474 void WelsI8x8LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
475 int32_t iStride[8];
476 uint8_t uiPixelFilterL[8];
477 uint16_t uiTotal = 0;
478 int32_t i;
479
480 for (iStride[0] = 0, i = 1; i < 8; i++) {
481 iStride[i] = iStride[i - 1] + kiStride;
482 }
483
484 uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
485 pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
486 for (i = 1; i < 7; i++) {
487 uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
488 2);
489 }
490 uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
491
492 // 8-92
493 for (i = 0; i < 8; i++) {
494 uiTotal += uiPixelFilterL[i];
495 }
496
497 const uint8_t kuiMean = ((uiTotal + 4) >> 3);
498 const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
499
500 for (i = 0; i < 8; i++) {
501 ST64A8 (pPred + iStride[i], kuiMean64);
502 }
503 }
504
WelsI8x8LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)505 void WelsI8x8LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
506 int32_t iStride[8];
507 uint8_t uiPixelFilterT[8];
508 uint16_t uiTotal = 0;
509 int32_t i;
510
511 for (iStride[0] = 0, i = 1; i < 8; i++) {
512 iStride[i] = iStride[i - 1] + kiStride;
513 }
514
515 uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
516 pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2);
517 for (i = 1; i < 7; i++) {
518 uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
519 }
520 uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
521 pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2);
522
523 // 8-93
524 for (i = 0; i < 8; i++) {
525 uiTotal += uiPixelFilterT[i];
526 }
527
528 const uint8_t kuiMean = ((uiTotal + 4) >> 3);
529 const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
530
531 for (i = 0; i < 8; i++) {
532 ST64A8 (pPred + iStride[i], kuiMean64);
533 }
534 }
535
WelsI8x8LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)536 void WelsI8x8LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
537 // for normal 8 bit depth, 8-94
538 const uint64_t kuiDC64 = 0x8080808080808080ULL;
539
540 int32_t iStride[8];
541 int32_t i;
542 ST64A8 (pPred, kuiDC64);
543 for (iStride[0] = 0, i = 1; i < 8; i++) {
544 iStride[i] = iStride[i - 1] + kiStride;
545 ST64A8 (pPred + iStride[i], kuiDC64);
546 }
547 }
548
/*down left*/
/*!
 * \brief  8x8 diagonal down-left prediction using the sixteen samples
 *         pPred[-kiStride + 0..15] (top and top-right).
 *
 * The sixteen samples are smoothed with a (1,2,1)/4 kernel, then smoothed a
 * second time into a 15-entry diagonal table; pixel (x, y) takes entry x + y.
 *
 * \param  pPred     first byte of the 8x8 block to fill (written in place)
 * \param  kiStride  byte distance between two consecutive rows
 * \param  bTLAvail  true: pPred[-1 - kiStride] takes part in smoothing the first top
 *                   sample; false: that sample is weighted three times instead
 * \param  bTRAvail  not used by this predictor (the top-right samples are always read)
 */
void WelsI8x8LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // Top and Top-right available
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];
  uint8_t uiDiag[15];
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 15; k++) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiFiltT[15] = (pTop[14] + pTop[15] * 3 + 2) >> 2;

  // 8-96: every pixel except the bottom-right one
  for (k = 0; k < 14; k++) {
    uiDiag[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }
  // 8-95: bottom-right pixel (x == 7 && y == 7)
  uiDiag[14] = (uiFiltT[14] + 3 * uiFiltT[15] + 2) >> 2;

  for (iY = 0; iY < 8; iY++) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; iX++) {
      pRow[iX] = uiDiag[iX + iY];
    }
  }
}
577
/*down left*/
/*!
 * \brief  8x8 diagonal down-left prediction that reads only the eight samples
 *         pPred[-kiStride + 0..7]; the eighth sample stands in for the top-right ones.
 *
 * \param  pPred     first byte of the 8x8 block to fill (written in place)
 * \param  kiStride  byte distance between two consecutive rows
 * \param  bTLAvail  true: pPred[-1 - kiStride] takes part in smoothing the first top
 *                   sample; false: that sample is weighted three times instead
 * \param  bTRAvail  not used by this predictor (the top-right samples are never read)
 */
void WelsI8x8LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // Top available and Top-right unavailable
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];
  uint8_t uiDiag[15];
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 7; k++) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  // p[x, -1] x=8...15 are replaced with p[7, -1]
  uiFiltT[7] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  for (k = 8; k < 16; k++) {
    uiFiltT[k] = pTop[7];
  }

  // 8-96: every pixel except the bottom-right one
  for (k = 0; k < 14; k++) {
    uiDiag[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }
  // 8-95: bottom-right pixel (x == 7 && y == 7)
  uiDiag[14] = (uiFiltT[14] + 3 * uiFiltT[15] + 2) >> 2;

  for (iY = 0; iY < 8; iY++) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; iX++) {
      pRow[iX] = uiDiag[iX + iY];
    }
  }
}
610
611 /*down right*/
/*!
 * \brief  8x8 diagonal down-right prediction from the left column, the top-left
 *         corner and the row above.
 *
 * The 17 edge samples are smoothed with a (1,2,1)/4 kernel and laid out as one
 * line (left column bottom-to-top, corner, top row left-to-right). A second
 * (1,2,1)/4 pass over that line gives a diagonal table; pixel (x, y) takes the
 * entry centred on line position 8 + x - y.
 *
 * \param  pPred     first byte of the 8x8 block to fill (written in place)
 * \param  kiStride  byte distance between two consecutive rows
 * \param  bTLAvail  not used by this predictor (the corner sample is always read)
 * \param  bTRAvail  true: pPred[8 - kiStride] takes part in smoothing the last top
 *                   sample; false: that sample is weighted three times instead
 */
void WelsI8x8LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // The TopLeft, Top, Left are all available under this mode
  const uint8_t* pTop  = pPred - kiStride;
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiEdge[17];   // [0..7] = left 7..0, [8] = corner, [9..16] = top 0..7 (all smoothed)
  uint8_t uiDiag[15];   // uiDiag[k] is centred on uiEdge[k + 1]
  int32_t iX, iY, k;

  uiEdge[8] = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;
  uiEdge[7] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  uiEdge[9] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (k = 1; k < 7; k++) {
    const uint8_t* pCur = pLeft + k * kiStride;
    uiEdge[7 - k] = (pCur[-kiStride] + (pCur[0] << 1) + pCur[kiStride] + 2) >> 2;
    uiEdge[9 + k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiEdge[0] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;
  if (bTRAvail) {
    uiEdge[16] = (pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2;
  } else {
    uiEdge[16] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  }

  // 8-97 (x > y), 8-98 (x < y) and 8-99 (x == y) all reduce to the same 3-tap
  // expression once the edge is stored as a single line.
  for (k = 0; k < 15; k++) {
    uiDiag[k] = (uiEdge[k] + (uiEdge[k + 1] << 1) + uiEdge[k + 2] + 2) >> 2;
  }

  for (iY = 0; iY < 8; iY++) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; iX++) {
      pRow[iX] = uiDiag[7 + iX - iY];
    }
  }
}
660
/*vertical left*/
/*!
 * \brief  8x8 vertical-left prediction using the sixteen samples
 *         pPred[-kiStride + 0..15] (top and top-right).
 *
 * After a (1,2,1)/4 smoothing of the sixteen samples, even rows take 2-tap
 * averages and odd rows take (1,2,1)/4 averages of the smoothed line; every two
 * rows the source window moves one sample to the right.
 *
 * \param  pPred     first byte of the 8x8 block to fill (written in place)
 * \param  kiStride  byte distance between two consecutive rows
 * \param  bTLAvail  true: pPred[-1 - kiStride] takes part in smoothing the first top
 *                   sample; false: that sample is weighted three times instead
 * \param  bTRAvail  not used by this predictor (the top-right samples are always read)
 */
void WelsI8x8LumaPredVL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // Top and Top-right available
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];
  uint8_t uiTwoTap[11];     // even rows, 8-108
  uint8_t uiThreeTap[11];   // odd rows, 8-109
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 15; k++) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiFiltT[15] = (pTop[14] + pTop[15] * 3 + 2) >> 2;

  // x + (y >> 1) never exceeds 10, so eleven entries per table are enough
  for (k = 0; k < 11; k++) {
    uiTwoTap[k]   = (uiFiltT[k] + uiFiltT[k + 1] + 1) >> 1;
    uiThreeTap[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }

  for (iY = 0; iY < 8; iY++) {
    const uint8_t* pSrc = ((iY & 0x01) ? uiThreeTap : uiTwoTap) + (iY >> 1);
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; iX++) {
      pRow[iX] = pSrc[iX];
    }
  }
}
692
/*vertical left*/
/*!
 * \brief  8x8 vertical-left prediction that reads only the eight samples
 *         pPred[-kiStride + 0..7]; the eighth sample stands in for the top-right ones.
 *
 * \param  pPred     first byte of the 8x8 block to fill (written in place)
 * \param  kiStride  byte distance between two consecutive rows
 * \param  bTLAvail  true: pPred[-1 - kiStride] takes part in smoothing the first top
 *                   sample; false: that sample is weighted three times instead
 * \param  bTRAvail  not used by this predictor (the top-right samples are never read)
 */
void WelsI8x8LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // Top available and Top-right unavailable
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];
  uint8_t uiTwoTap[11];     // even rows, 8-108
  uint8_t uiThreeTap[11];   // odd rows, 8-109
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 7; k++) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  // p[x, -1] x=8...15 are replaced with p[7, -1]
  uiFiltT[7] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  for (k = 8; k < 16; k++) {
    uiFiltT[k] = pTop[7];
  }

  // x + (y >> 1) never exceeds 10, so eleven entries per table are enough
  for (k = 0; k < 11; k++) {
    uiTwoTap[k]   = (uiFiltT[k] + uiFiltT[k + 1] + 1) >> 1;
    uiThreeTap[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }

  for (iY = 0; iY < 8; iY++) {
    const uint8_t* pSrc = ((iY & 0x01) ? uiThreeTap : uiTwoTap) + (iY >> 1);
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; iX++) {
      pRow[iX] = pSrc[iX];
    }
  }
}
728
729 /*vertical right*/
/*!
 * \brief 8x8 luma intra prediction, vertical-right mode.
 *
 * Reads the top-left sample, the row above (x = 0..7, plus x = 8 when
 * bTRAvail is set) and the left column (y = 0..7), smooths them with a
 * (1,2,1) filter and writes the prediction in place at pPred.
 * Tags such as "8-100" are kept from the original comments (presumably
 * equation numbers of the H.264 specification -- confirm).
 *
 * \param pPred     top-left sample of the 8x8 block
 * \param kiStride  offset between vertically adjacent samples
 * \param bTLAvail  ignored; top-left, top and left are read unconditionally
 * \param bTRAvail  when true, pPred[8 - kiStride] takes part in filtering the last top sample
 */
void WelsI8x8LumaPredVR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // The TopLeft, Top, Left are always available under this mode
  const uint8_t* pTop  = pPred - kiStride;
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiFiltTL;
  uint8_t uiFiltL[7]; // only entries 0..6 are read by the prediction loops
  uint8_t uiFiltT[8];
  int32_t x, y;

  (void) bTLAvail;

  uiFiltTL = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;

  uiFiltL[0] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  for (y = 1; y < 7; y++) {
    uiFiltL[y] = (pLeft[ (y - 1) * kiStride] + (pLeft[y * kiStride] << 1) + pLeft[ (y + 1) * kiStride] + 2) >> 2;
  }

  uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (x = 1; x < 7; x++) {
    uiFiltT[x] = (pTop[x - 1] + (pTop[x] << 1) + pTop[x + 1] + 2) >> 2;
  }
  if (bTRAvail) {
    uiFiltT[7] = (pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2;
  } else {
    uiFiltT[7] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  }

  for (y = 0; y < 8; y++) {
    uint8_t* pRow = pPred + y * kiStride;

    for (x = 0; x < 8; x++) {
      const int32_t kiZ = (x << 1) - y;   // 2 * x - y
      const int32_t kiD = x - (y >> 1);
      int32_t iVal;

      if (kiZ < -1) { // 8-103; for kiZ == -2 the third tap is the filtered top-left
        const int32_t kiK   = -kiZ;       // 2..7
        const int32_t kiFar = (kiK == 2) ? uiFiltTL : uiFiltL[kiK - 3];
        iVal = (uiFiltL[kiK - 1] + (uiFiltL[kiK - 2] << 1) + kiFar + 2) >> 2;
      } else if (kiZ == -1) { // 8-102
        iVal = (uiFiltL[0] + (uiFiltTL << 1) + uiFiltT[0] + 2) >> 2;
      } else if (kiZ & 0x01) { // 8-101; kiD >= 1 here, kiD == 1 reaches the top-left
        const int32_t kiFar = (kiD > 1) ? uiFiltT[kiD - 2] : uiFiltTL;
        iVal = (kiFar + (uiFiltT[kiD - 1] << 1) + uiFiltT[kiD] + 2) >> 2;
      } else { // 8-100; kiD >= 0 here, kiD == 0 reaches the top-left
        const int32_t kiNear = (kiD > 0) ? uiFiltT[kiD - 1] : uiFiltTL;
        iVal = (kiNear + uiFiltT[kiD] + 1) >> 1;
      }
      pRow[x] = (uint8_t) iVal;
    }
  }
}
786
787 /*horizontal up*/
/*!
 * \brief 8x8 luma intra prediction, horizontal-up mode.
 *
 * Reads the left column pPred[-1 + y * kiStride], y = 0..7 (plus the top-left
 * sample when bTLAvail is set), smooths it with a (1,2,1) filter and writes
 * the prediction in place at pPred.
 * Tags such as "8-110" are kept from the original comments (presumably
 * equation numbers of the H.264 specification -- confirm).
 *
 * \param pPred     top-left sample of the 8x8 block
 * \param kiStride  offset between vertically adjacent samples
 * \param bTLAvail  when true, pPred[-1 - kiStride] takes part in filtering the first left sample
 * \param bTRAvail  ignored by this mode
 */
void WelsI8x8LumaPredHU_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiFiltL[8];
  int32_t x, y;

  (void) bTRAvail;

  if (bTLAvail) {
    uiFiltL[0] = (pLeft[-kiStride] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  } else {
    uiFiltL[0] = (pLeft[0] * 3 + pLeft[kiStride] + 2) >> 2;
  }
  for (y = 1; y < 7; y++) {
    uiFiltL[y] = (pLeft[ (y - 1) * kiStride] + (pLeft[y * kiStride] << 1) + pLeft[ (y + 1) * kiStride] + 2) >> 2;
  }
  uiFiltL[7] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;

  for (y = 0; y < 8; y++) {
    uint8_t* pRow = pPred + y * kiStride;

    for (x = 0; x < 8; x++) {
      const int32_t kiZ   = x + (y << 1); // x + 2 * y
      const int32_t kiIdx = kiZ >> 1;
      int32_t iVal;

      if (kiZ > 13) { // 8-113: past the end of the left column, repeat the last sample
        iVal = uiFiltL[7];
      } else if (kiZ == 13) { // 8-112
        iVal = (uiFiltL[6] + 3 * uiFiltL[7] + 2) >> 2;
      } else if (kiZ & 0x01) { // 8-111
        iVal = (uiFiltL[kiIdx] + (uiFiltL[kiIdx + 1] << 1) + uiFiltL[kiIdx + 2] + 2) >> 2;
      } else { // 8-110
        iVal = (uiFiltL[kiIdx] + uiFiltL[kiIdx + 1] + 1) >> 1;
      }
      pRow[x] = (uint8_t) iVal;
    }
  }
}
824
825 /*horizontal down*/
/*!
 * \brief 8x8 luma intra prediction, horizontal-down mode.
 *
 * Reads the top-left sample, the row above (x = 0..7) and the left column
 * (y = 0..7), smooths them with a (1,2,1) filter and writes the prediction in
 * place at pPred. The filtered top sample at x = 7 is never consumed by this
 * mode, so it is not computed and pPred[8 - kiStride] is not read.
 * Tags such as "8-104" are kept from the original comments (presumably
 * equation numbers of the H.264 specification -- confirm).
 *
 * \param pPred     top-left sample of the 8x8 block
 * \param kiStride  offset between vertically adjacent samples
 * \param bTLAvail  ignored; top-left, top and left are read unconditionally
 * \param bTRAvail  ignored; no prediction sample depends on the top-right neighbour
 */
void WelsI8x8LumaPredHD_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // The TopLeft, Top, Left are all available under this mode
  const uint8_t* pTop  = pPred - kiStride;
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiFiltTL;
  uint8_t uiFiltL[8];
  uint8_t uiFiltT[7]; // only entries 0..6 are read by the prediction loops
  int32_t x, y;

  (void) bTLAvail;
  (void) bTRAvail;

  uiFiltTL = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;

  uiFiltL[0] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  for (y = 1; y < 7; y++) {
    uiFiltL[y] = (pLeft[ (y - 1) * kiStride] + (pLeft[y * kiStride] << 1) + pLeft[ (y + 1) * kiStride] + 2) >> 2;
  }
  uiFiltL[7] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;

  uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (x = 1; x < 7; x++) {
    uiFiltT[x] = (pTop[x - 1] + (pTop[x] << 1) + pTop[x + 1] + 2) >> 2;
  }

  for (y = 0; y < 8; y++) {
    uint8_t* pRow = pPred + y * kiStride;

    for (x = 0; x < 8; x++) {
      const int32_t kiZ = (y << 1) - x;   // 2 * y - x
      const int32_t kiD = y - (x >> 1);
      int32_t iVal;

      if (kiZ < -1) { // 8-107; for kiZ == -2 the third tap is the filtered top-left
        const int32_t kiK   = -kiZ;       // 2..7
        const int32_t kiFar = (kiK == 2) ? uiFiltTL : uiFiltT[kiK - 3];
        iVal = (uiFiltT[kiK - 1] + (uiFiltT[kiK - 2] << 1) + kiFar + 2) >> 2;
      } else if (kiZ == -1) { // 8-106
        iVal = (uiFiltL[0] + (uiFiltTL << 1) + uiFiltT[0] + 2) >> 2;
      } else if (kiZ & 0x01) { // 8-105; kiD >= 1 here, kiD == 1 reaches the top-left
        const int32_t kiFar = (kiD == 1) ? uiFiltTL : uiFiltL[kiD - 2];
        iVal = (kiFar + (uiFiltL[kiD - 1] << 1) + uiFiltL[kiD] + 2) >> 2;
      } else { // 8-104; kiD >= 0 here, kiD == 0 reaches the top-left
        const int32_t kiNear = (kiD == 0) ? uiFiltTL : uiFiltL[kiD - 1];
        iVal = (kiNear + uiFiltL[kiD] + 1) >> 1;
      }
      pRow[x] = (uint8_t) iVal;
    }
  }
}
882
883
WelsIChromaPredV_c(uint8_t * pPred,const int32_t kiStride)884 void WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride) {
885 const uint64_t kuiVal64 = LD64A8 (&pPred[-kiStride]);
886 const int32_t kiStride2 = kiStride << 1;
887 const int32_t kiStride4 = kiStride2 << 1;
888
889 ST64A8 (pPred, kuiVal64);
890 ST64A8 (pPred + kiStride, kuiVal64);
891 ST64A8 (pPred + kiStride2, kuiVal64);
892 ST64A8 (pPred + kiStride2 + kiStride, kuiVal64);
893 ST64A8 (pPred + kiStride4, kuiVal64);
894 ST64A8 (pPred + kiStride4 + kiStride, kuiVal64);
895 ST64A8 (pPred + kiStride4 + kiStride2, kuiVal64);
896 ST64A8 (pPred + (kiStride << 3) - kiStride, kuiVal64);
897 }
898
WelsIChromaPredH_c(uint8_t * pPred,const int32_t kiStride)899 void WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride) {
900 int32_t iTmp = (kiStride << 3) - kiStride;
901 uint8_t i = 7;
902
903 do {
904 const uint8_t kuiVal8 = pPred[iTmp - 1];
905 const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8;
906
907 ST64A8 (pPred + iTmp, kuiVal64);
908
909 iTmp -= kiStride;
910 } while (i-- > 0);
911 }
912
913
WelsIChromaPredPlane_c(uint8_t * pPred,const int32_t kiStride)914 void WelsIChromaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
915 int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
916 int32_t i, j;
917 uint8_t* pTop = &pPred[-kiStride];
918 uint8_t* pLeft = &pPred[-1];
919
920 for (i = 0 ; i < 4 ; i ++) {
921 H += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
922 V += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);
923 }
924
925 a = (pLeft[7 * kiStride] + pTop[7]) << 4;
926 b = (17 * H + 16) >> 5;
927 c = (17 * V + 16) >> 5;
928
929 for (i = 0 ; i < 8 ; i ++) {
930 for (j = 0 ; j < 8 ; j ++) {
931 int32_t iTmp = (a + b * (j - 3) + c * (i - 3) + 16) >> 5;
932 iTmp = WelsClip1 (iTmp);
933 pPred[j] = iTmp;
934 }
935 pPred += kiStride;
936 }
937 }
938
939
WelsIChromaPredDc_c(uint8_t * pPred,const int32_t kiStride)940 void WelsIChromaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
941 const int32_t kiL1 = kiStride - 1;
942 const int32_t kiL2 = kiL1 + kiStride;
943 const int32_t kiL3 = kiL2 + kiStride;
944 const int32_t kiL4 = kiL3 + kiStride;
945 const int32_t kiL5 = kiL4 + kiStride;
946 const int32_t kiL6 = kiL5 + kiStride;
947 const int32_t kiL7 = kiL6 + kiStride;
948 /*caculate the kMean value*/
949 const uint8_t kuiM1 = (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] +
950 pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 4) >> 3 ;
951 const uint32_t kuiSum2 = pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride];
952 const uint32_t kuiSum3 = pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7];
953 const uint8_t kuiM2 = (kuiSum2 + 2) >> 2;
954 const uint8_t kuiM3 = (kuiSum3 + 2) >> 2;
955 const uint8_t kuiM4 = (kuiSum2 + kuiSum3 + 4) >> 3;
956 const uint8_t kuiMUP[8] = {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
957 const uint8_t kuiMDown[8] = {kuiM3, kuiM3, kuiM3, kuiM3, kuiM4, kuiM4, kuiM4, kuiM4};
958 const uint64_t kuiUP64 = LD64 (kuiMUP);
959 const uint64_t kuiDN64 = LD64 (kuiMDown);
960
961 ST64A8 (pPred, kuiUP64);
962 ST64A8 (pPred + kiL1 + 1, kuiUP64);
963 ST64A8 (pPred + kiL2 + 1, kuiUP64);
964 ST64A8 (pPred + kiL3 + 1, kuiUP64);
965 ST64A8 (pPred + kiL4 + 1, kuiDN64);
966 ST64A8 (pPred + kiL5 + 1, kuiDN64);
967 ST64A8 (pPred + kiL6 + 1, kuiDN64);
968 ST64A8 (pPred + kiL7 + 1, kuiDN64);
969 }
970
WelsIChromaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)971 void WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
972 const int32_t kiL1 = -1 + kiStride;
973 const int32_t kiL2 = kiL1 + kiStride;
974 const int32_t kiL3 = kiL2 + kiStride;
975 const int32_t kiL4 = kiL3 + kiStride;
976 const int32_t kiL5 = kiL4 + kiStride;
977 const int32_t kiL6 = kiL5 + kiStride;
978 const int32_t kiL7 = kiL6 + kiStride;
979 /*caculate the kMean value*/
980 const uint8_t kuiMUP = (pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 2) >> 2 ;
981 const uint8_t kuiMDown = (pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7] + 2) >> 2;
982 const uint64_t kuiUP64 = 0x0101010101010101ULL * kuiMUP;
983 const uint64_t kuiDN64 = 0x0101010101010101ULL * kuiMDown;
984
985 ST64A8 (pPred, kuiUP64);
986 ST64A8 (pPred + kiL1 + 1, kuiUP64);
987 ST64A8 (pPred + kiL2 + 1, kuiUP64);
988 ST64A8 (pPred + kiL3 + 1, kuiUP64);
989 ST64A8 (pPred + kiL4 + 1, kuiDN64);
990 ST64A8 (pPred + kiL5 + 1, kuiDN64);
991 ST64A8 (pPred + kiL6 + 1, kuiDN64);
992 ST64A8 (pPred + kiL7 + 1, kuiDN64);
993 }
994
WelsIChromaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)995 void WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
996 int32_t iTmp = (kiStride << 3) - kiStride;
997 /*caculate the kMean value*/
998 const uint8_t kuiM1 = (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] + 2) >> 2;
999 const uint8_t kuiM2 = (pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride] + 2) >>
1000 2;
1001 const uint8_t kuiM[8] = {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
1002
1003 uint8_t i = 7;
1004
1005 do {
1006 ST64A8 (pPred + iTmp, LD64 (kuiM));
1007
1008 iTmp -= kiStride;
1009 } while (i-- > 0);
1010 }
1011
WelsIChromaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)1012 void WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
1013 int32_t iTmp = (kiStride << 3) - kiStride;
1014 const uint64_t kuiDC64 = 0x8080808080808080ULL;
1015 uint8_t i = 7;
1016
1017 do {
1018 ST64A8 (pPred + iTmp, kuiDC64);
1019
1020 iTmp -= kiStride;
1021 } while (i-- > 0);
1022 }
1023
WelsI16x16LumaPredV_c(uint8_t * pPred,const int32_t kiStride)1024 void WelsI16x16LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
1025 int32_t iTmp = (kiStride << 4) - kiStride;
1026 const uint64_t kuiTop1 = LD64A8 (pPred - kiStride);
1027 const uint64_t kuiTop2 = LD64A8 (pPred - kiStride + 8);
1028 uint8_t i = 15;
1029
1030 do {
1031 ST64A8 (pPred + iTmp, kuiTop1);
1032 ST64A8 (pPred + iTmp + 8, kuiTop2);
1033
1034 iTmp -= kiStride;
1035 } while (i-- > 0);
1036 }
1037
WelsI16x16LumaPredH_c(uint8_t * pPred,const int32_t kiStride)1038 void WelsI16x16LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
1039 int32_t iTmp = (kiStride << 4) - kiStride;
1040 uint8_t i = 15;
1041
1042 do {
1043 const uint8_t kuiVal8 = pPred[iTmp - 1];
1044 const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8;
1045
1046 ST64A8 (pPred + iTmp, kuiVal64);
1047 ST64A8 (pPred + iTmp + 8, kuiVal64);
1048
1049 iTmp -= kiStride;
1050 } while (i-- > 0);
1051 }
1052
WelsI16x16LumaPredPlane_c(uint8_t * pPred,const int32_t kiStride)1053 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
1054 int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
1055 int32_t i, j;
1056 uint8_t* pTop = &pPred[-kiStride];
1057 uint8_t* pLeft = &pPred[-1];
1058
1059 for (i = 0 ; i < 8 ; i ++) {
1060 H += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
1061 V += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);
1062 }
1063
1064 a = (pLeft[15 * kiStride] + pTop[15]) << 4;
1065 b = (5 * H + 32) >> 6;
1066 c = (5 * V + 32) >> 6;
1067
1068 for (i = 0 ; i < 16 ; i ++) {
1069 for (j = 0 ; j < 16 ; j ++) {
1070 int32_t iTmp = (a + b * (j - 7) + c * (i - 7) + 16) >> 5;
1071 iTmp = WelsClip1 (iTmp);
1072 pPred[j] = iTmp;
1073 }
1074 pPred += kiStride;
1075 }
1076 }
1077
WelsI16x16LumaPredDc_c(uint8_t * pPred,const int32_t kiStride)1078 void WelsI16x16LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
1079 int32_t iTmp = (kiStride << 4) - kiStride;
1080 int32_t iSum = 0;
1081 uint8_t i = 15;
1082 uint8_t uiMean = 0;
1083
1084 /*caculate the kMean value*/
1085 do {
1086 iSum += pPred[-1 + iTmp] + pPred[-kiStride + i];
1087 iTmp -= kiStride;
1088 } while (i-- > 0);
1089 uiMean = (16 + iSum) >> 5;
1090
1091 iTmp = (kiStride << 4) - kiStride;
1092 i = 15;
1093 do {
1094 memset (&pPred[iTmp], uiMean, I16x16_COUNT);
1095 iTmp -= kiStride;
1096 } while (i-- > 0);
1097 }
1098
1099
WelsI16x16LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)1100 void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
1101 int32_t iTmp = (kiStride << 4) - kiStride;
1102 int32_t iSum = 0;
1103 uint8_t i = 15;
1104 uint8_t uiMean = 0;
1105
1106 /*caculate the kMean value*/
1107 do {
1108 iSum += pPred[-kiStride + i];
1109 } while (i-- > 0);
1110 uiMean = (8 + iSum) >> 4;
1111
1112 i = 15;
1113 do {
1114 memset (&pPred[iTmp], uiMean, I16x16_COUNT);
1115 iTmp -= kiStride;
1116 } while (i-- > 0);
1117 }
1118
WelsI16x16LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)1119 void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
1120 int32_t iTmp = (kiStride << 4) - kiStride;
1121 int32_t iSum = 0;
1122 uint64_t uiMean64 = 0;
1123 uint8_t uiMean = 0;
1124 uint8_t i = 15;
1125
1126 /*caculate the kMean value*/
1127 do {
1128 iSum += pPred[-1 + iTmp];
1129 iTmp -= kiStride;
1130 } while (i-- > 0);
1131 uiMean = (8 + iSum) >> 4;
1132 uiMean64 = 0x0101010101010101ULL * uiMean;
1133
1134 iTmp = (kiStride << 4) - kiStride;
1135 i = 15;
1136 do {
1137 ST64A8 (pPred + iTmp, uiMean64);
1138 ST64A8 (pPred + iTmp + 8, uiMean64);
1139
1140 iTmp -= kiStride;
1141 } while (i-- > 0);
1142 }
1143
WelsI16x16LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)1144 void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
1145 const uint64_t kuiDC64 = 0x8080808080808080ULL;
1146 int32_t iTmp = (kiStride << 4) - kiStride;
1147 uint8_t i = 15;
1148
1149 do {
1150 ST64A8 (pPred + iTmp, kuiDC64);
1151 ST64A8 (pPred + iTmp + 8, kuiDC64);
1152
1153 iTmp -= kiStride;
1154 } while (i-- > 0);
1155 }
1156
1157 } // namespace WelsDec
1158