• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    get_intra_predictor.c
33  *
34  * \brief   implementation for get intra predictor about 16x16, 4x4, chroma.
35  *
36  * \date    4/2/2009 Created
37  *          9/14/2009 C level based optimization with high performance gained.
38  *              [const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
39  *
40  *************************************************************************************
41  */
42 #include <string.h>
43 
44 #include "macros.h"
45 #include "ls_defines.h"
46 #include "get_intra_predictor.h"
47 
48 namespace WelsDec {
49 
50 #define I4x4_COUNT 4
51 #define I8x8_COUNT 8
52 #define I16x16_COUNT 16
53 
WelsI4x4LumaPredV_c(uint8_t * pPred,const int32_t kiStride)54 void WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
55   const uint32_t kuiVal = LD32A4 (pPred - kiStride);
56 
57   ST32A4 (pPred, kuiVal);
58   ST32A4 (pPred + kiStride, kuiVal);
59   ST32A4 (pPred + (kiStride << 1), kuiVal);
60   ST32A4 (pPred + (kiStride << 1) + kiStride, kuiVal);
61 }
62 
WelsI4x4LumaPredH_c(uint8_t * pPred,const int32_t kiStride)63 void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
64   const int32_t kiStride2 = kiStride << 1;
65   const int32_t kiStride3 = kiStride2 + kiStride;
66   const uint32_t kuiL0 = 0x01010101U * pPred[-1          ];
67   const uint32_t kuiL1 = 0x01010101U * pPred[-1 + kiStride ];
68   const uint32_t kuiL2 = 0x01010101U * pPred[-1 + kiStride2];
69   const uint32_t kuiL3 = 0x01010101U * pPred[-1 + kiStride3];
70 
71   ST32A4 (pPred, kuiL0);
72   ST32A4 (pPred + kiStride, kuiL1);
73   ST32A4 (pPred + kiStride2, kuiL2);
74   ST32A4 (pPred + kiStride3, kuiL3);
75 }
76 
WelsI4x4LumaPredDc_c(uint8_t * pPred,const int32_t kiStride)77 void WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
78   const int32_t kiStride2  = kiStride << 1;
79   const int32_t kiStride3  = kiStride2 + kiStride;
80   const uint8_t kuiMean    = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] +
81                               pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 4) >> 3;
82   const uint32_t kuiMean32 = 0x01010101U * kuiMean;
83 
84   ST32A4 (pPred, kuiMean32);
85   ST32A4 (pPred + kiStride, kuiMean32);
86   ST32A4 (pPred + kiStride2, kuiMean32);
87   ST32A4 (pPred + kiStride3, kuiMean32);
88 }
89 
WelsI4x4LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)90 void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
91   const int32_t kiStride2  = kiStride << 1;
92   const int32_t kiStride3  = kiStride2 + kiStride;
93   const uint8_t kuiMean    = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] + 2) >> 2;
94   const uint32_t kuiMean32 = 0x01010101U * kuiMean;
95 
96   ST32A4 (pPred, kuiMean32);
97   ST32A4 (pPred + kiStride, kuiMean32);
98   ST32A4 (pPred + kiStride2, kuiMean32);
99   ST32A4 (pPred + kiStride3, kuiMean32);
100 }
101 
WelsI4x4LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)102 void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
103   const int32_t kiStride2  = kiStride << 1;
104   const int32_t kiStride3  = kiStride2 + kiStride;
105   const uint8_t kuiMean    = (pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 2)
106                              >> 2;
107   const uint32_t kuiMean32 = 0x01010101U * kuiMean;
108 
109   ST32A4 (pPred, kuiMean32);
110   ST32A4 (pPred + kiStride, kuiMean32);
111   ST32A4 (pPred + kiStride2, kuiMean32);
112   ST32A4 (pPred + kiStride3, kuiMean32);
113 }
114 
WelsI4x4LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)115 void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
116   const uint32_t kuiDC32 = 0x80808080U;
117 
118   ST32A4 (pPred, kuiDC32);
119   ST32A4 (pPred + kiStride, kuiDC32);
120   ST32A4 (pPred + (kiStride << 1), kuiDC32);
121   ST32A4 (pPred + (kiStride << 1) + kiStride, kuiDC32);
122 }
123 
124 /*down pLeft*/
WelsI4x4LumaPredDDL_c(uint8_t * pPred,const int32_t kiStride)125 void WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride) {
126   const int32_t kiStride2 = kiStride << 1;
127   const int32_t kiStride3 = kiStride + kiStride2;
128   /*get pTop*/
129   uint8_t* ptop         = &pPred[-kiStride];
130   const uint8_t kuiT0   = *ptop;
131   const uint8_t kuiT1   = * (ptop + 1);
132   const uint8_t kuiT2   = * (ptop + 2);
133   const uint8_t kuiT3   = * (ptop + 3);
134   const uint8_t kuiT4   = * (ptop + 4);
135   const uint8_t kuiT5   = * (ptop + 5);
136   const uint8_t kuiT6   = * (ptop + 6);
137   const uint8_t kuiT7   = * (ptop + 7);
138   const uint8_t kuiDDL0 = (2 + kuiT0 + kuiT2 + (kuiT1 << 1)) >> 2;      // kDDL0
139   const uint8_t kuiDDL1 = (2 + kuiT1 + kuiT3 + (kuiT2 << 1)) >> 2;      // kDDL1
140   const uint8_t kuiDDL2 = (2 + kuiT2 + kuiT4 + (kuiT3 << 1)) >> 2;      // kDDL2
141   const uint8_t kuiDDL3 = (2 + kuiT3 + kuiT5 + (kuiT4 << 1)) >> 2;      // kDDL3
142   const uint8_t kuiDDL4 = (2 + kuiT4 + kuiT6 + (kuiT5 << 1)) >> 2;      // kDDL4
143   const uint8_t kuiDDL5 = (2 + kuiT5 + kuiT7 + (kuiT6 << 1)) >> 2;      // kDDL5
144   const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2;      // kDDL6
145   const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 };
146 
147   ST32A4 (pPred, LD32 (kuiList));
148   ST32A4 (pPred + kiStride, LD32 (kuiList + 1));
149   ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
150   ST32A4 (pPred + kiStride3, LD32 (kuiList + 3));
151 }
152 
153 /*down pLeft*/
WelsI4x4LumaPredDDLTop_c(uint8_t * pPred,const int32_t kiStride)154 void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride) {
155   const int32_t kiStride2 = kiStride << 1;
156   const int32_t kiStride3 = kiStride + kiStride2;
157   /*get pTop*/
158   uint8_t* ptop         = &pPred[-kiStride];
159   const uint8_t kuiT0   = *ptop;
160   const uint8_t kuiT1   = * (ptop + 1);
161   const uint8_t kuiT2   = * (ptop + 2);
162   const uint8_t kuiT3   = * (ptop + 3);
163   const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
164   const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
165   const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
166   const uint16_t kuiT33 = 1 + (kuiT3 << 1);
167   const uint8_t kuiDLT0 = (kuiT01 + kuiT12) >> 2;       // kDLT0
168   const uint8_t kuiDLT1 = (kuiT12 + kuiT23) >> 2;       // kDLT1
169   const uint8_t kuiDLT2 = (kuiT23 + kuiT33) >> 2;       // kDLT2
170   const uint8_t kuiDLT3 = kuiT33 >> 1;                  // kDLT3
171   const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 };
172 
173   ST32A4 (pPred,             LD32 (kuiList));
174   ST32A4 (pPred + kiStride,  LD32 (kuiList + 1));
175   ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
176   ST32A4 (pPred + kiStride3, LD32 (kuiList + 3));
177 }
178 
179 
180 /*down right*/
WelsI4x4LumaPredDDR_c(uint8_t * pPred,const int32_t kiStride)181 void WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride) {
182   const int32_t kiStride2 = kiStride << 1;
183   const int32_t kiStride3 = kiStride + kiStride2;
184   uint8_t* ptopleft       = &pPred[- (kiStride + 1)];
185   uint8_t* pleft          = &pPred[-1];
186   const uint8_t kuiLT     = *ptopleft;
187   /*get pLeft and pTop*/
188   const uint8_t kuiL0   = *pleft;
189   const uint8_t kuiL1   = * (pleft + kiStride);
190   const uint8_t kuiL2   = * (pleft + kiStride2);
191   const uint8_t kuiL3   = * (pleft + kiStride3);
192   const uint8_t kuiT0   = * (ptopleft + 1);
193   const uint8_t kuiT1   = * (ptopleft + 2);
194   const uint8_t kuiT2   = * (ptopleft + 3);
195   const uint8_t kuiT3   = * (ptopleft + 4);
196   const uint16_t kuiTL0 = 1 + kuiLT + kuiL0;
197   const uint16_t kuiLT0 = 1 + kuiLT + kuiT0;
198   const uint16_t kuiT01 = 1 + kuiT0 + kuiT1;
199   const uint16_t kuiT12 = 1 + kuiT1 + kuiT2;
200   const uint16_t kuiT23 = 1 + kuiT2 + kuiT3;
201   const uint16_t kuiL01 = 1 + kuiL0 + kuiL1;
202   const uint16_t kuiL12 = 1 + kuiL1 + kuiL2;
203   const uint16_t kuiL23 = 1 + kuiL2 + kuiL3;
204   const uint8_t kuiDDR0 = (kuiTL0 + kuiLT0) >> 2;       // kuiDDR0
205   const uint8_t kuiDDR1 = (kuiLT0 + kuiT01) >> 2;       // kuiDDR1
206   const uint8_t kuiDDR2 = (kuiT01 + kuiT12) >> 2;       // kuiDDR2
207   const uint8_t kuiDDR3 = (kuiT12 + kuiT23) >> 2;       // kuiDDR3
208   const uint8_t kuiDDR4 = (kuiTL0 + kuiL01) >> 2;       // kuiDDR4
209   const uint8_t kuiDDR5 = (kuiL01 + kuiL12) >> 2;       // kuiDDR5
210   const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2;       // kuiDDR6
211   const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0 };
212 
213   ST32A4 (pPred, LD32 (kuiList + 3));
214   ST32A4 (pPred + kiStride, LD32 (kuiList + 2));
215   ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
216   ST32A4 (pPred + kiStride3, LD32 (kuiList));
217 }
218 
219 
220 /*vertical pLeft*/
WelsI4x4LumaPredVL_c(uint8_t * pPred,const int32_t kiStride)221 void WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride) {
222   const int32_t kiStride2       = kiStride << 1;
223   const int32_t kiStride3       = kiStride + kiStride2;
224   uint8_t* ptopleft             = &pPred[- (kiStride + 1)];
225   /*get pTop*/
226   const uint8_t kuiT0           = * (ptopleft + 1);
227   const uint8_t kuiT1           = * (ptopleft + 2);
228   const uint8_t kuiT2           = * (ptopleft + 3);
229   const uint8_t kuiT3           = * (ptopleft + 4);
230   const uint8_t kuiT4           = * (ptopleft + 5);
231   const uint8_t kuiT5           = * (ptopleft + 6);
232   const uint8_t kuiT6           = * (ptopleft + 7);
233   const uint16_t kuiT01         = 1 + kuiT0 + kuiT1;
234   const uint16_t kuiT12         = 1 + kuiT1 + kuiT2;
235   const uint16_t kuiT23         = 1 + kuiT2 + kuiT3;
236   const uint16_t kuiT34         = 1 + kuiT3 + kuiT4;
237   const uint16_t kuiT45         = 1 + kuiT4 + kuiT5;
238   const uint16_t kuiT56         = 1 + kuiT5 + kuiT6;
239   const uint8_t kuiVL0          = kuiT01 >> 1;                  // kuiVL0
240   const uint8_t kuiVL1          = kuiT12 >> 1;                  // kuiVL1
241   const uint8_t kuiVL2          = kuiT23 >> 1;                  // kuiVL2
242   const uint8_t kuiVL3          = kuiT34 >> 1;                  // kuiVL3
243   const uint8_t kuiVL4          = kuiT45 >> 1;                  // kuiVL4
244   const uint8_t kuiVL5          = (kuiT01 + kuiT12) >> 2;       // kuiVL5
245   const uint8_t kuiVL6          = (kuiT12 + kuiT23) >> 2;       // kuiVL6
246   const uint8_t kuiVL7          = (kuiT23 + kuiT34) >> 2;       // kuiVL7
247   const uint8_t kuiVL8          = (kuiT34 + kuiT45) >> 2;       // kuiVL8
248   const uint8_t kuiVL9          = (kuiT45 + kuiT56) >> 2;       // kuiVL9
249   const uint8_t kuiList[10]     = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 };
250 
251   ST32A4 (pPred,             LD32 (kuiList));
252   ST32A4 (pPred + kiStride,  LD32 (kuiList + 5));
253   ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
254   ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
255 }
256 
257 /*vertical pLeft*/
WelsI4x4LumaPredVLTop_c(uint8_t * pPred,const int32_t kiStride)258 void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride) {
259   const int32_t kiStride2       = kiStride << 1;
260   const int32_t kiStride3       = kiStride + kiStride2;
261   uint8_t* ptopleft             = &pPred[- (kiStride + 1)];
262   /*get pTop*/
263   const uint8_t kuiT0           = * (ptopleft + 1);
264   const uint8_t kuiT1           = * (ptopleft + 2);
265   const uint8_t kuiT2           = * (ptopleft + 3);
266   const uint8_t kuiT3           = * (ptopleft + 4);
267   const uint16_t kuiT01         = 1 + kuiT0 + kuiT1;
268   const uint16_t kuiT12         = 1 + kuiT1 + kuiT2;
269   const uint16_t kuiT23         = 1 + kuiT2 + kuiT3;
270   const uint16_t kuiT33         = 1 + (kuiT3 << 1);
271   const uint8_t kuiVL0          = kuiT01 >> 1;
272   const uint8_t kuiVL1          = kuiT12 >> 1;
273   const uint8_t kuiVL2          = kuiT23 >> 1;
274   const uint8_t kuiVL3          = kuiT33 >> 1;
275   const uint8_t kuiVL4          = (kuiT01 + kuiT12) >> 2;
276   const uint8_t kuiVL5          = (kuiT12 + kuiT23) >> 2;
277   const uint8_t kuiVL6          = (kuiT23 + kuiT33) >> 2;
278   const uint8_t kuiVL7          = kuiVL3;
279   const uint8_t kuiList[10]     = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 };
280 
281   ST32A4 (pPred, LD32 (kuiList));
282   ST32A4 (pPred + kiStride, LD32 (kuiList + 5));
283   ST32A4 (pPred + kiStride2, LD32 (kuiList + 1));
284   ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
285 }
286 
287 
288 /*vertical right*/
WelsI4x4LumaPredVR_c(uint8_t * pPred,const int32_t kiStride)289 void WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride) {
290   const int32_t kiStride2       = kiStride << 1;
291   const int32_t kiStride3       = kiStride + kiStride2;
292   const uint8_t kuiLT           = pPred[-kiStride - 1];
293   /*get pLeft and pTop*/
294   const uint8_t kuiL0           = pPred[          - 1];
295   const uint8_t kuiL1           = pPred[kiStride  - 1];
296   const uint8_t kuiL2           = pPred[kiStride2 - 1];
297   const uint8_t kuiT0           = pPred[ -kiStride];
298   const uint8_t kuiT1           = pPred[1 - kiStride];
299   const uint8_t kuiT2           = pPred[2 - kiStride];
300   const uint8_t kuiT3           = pPred[3 - kiStride];
301   const uint8_t kuiVR0          = (1 + kuiLT + kuiT0) >> 1;     // kuiVR0
302   const uint8_t kuiVR1          = (1 + kuiT0 + kuiT1) >> 1;     // kuiVR1
303   const uint8_t kuiVR2          = (1 + kuiT1 + kuiT2) >> 1;     // kuiVR2
304   const uint8_t kuiVR3          = (1 + kuiT2 + kuiT3) >> 1;     // kuiVR3
305   const uint8_t kuiVR4          = (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;      // kuiVR4
306   const uint8_t kuiVR5          = (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;      // kuiVR5
307   const uint8_t kuiVR6          = (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;      // kuiVR6
308   const uint8_t kuiVR7          = (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2;      // kuiVR7
309   const uint8_t kuiVR8          = (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;      // kuiVR8
310   const uint8_t kuiVR9          = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;      // kuiVR9
311   const uint8_t kuiList[10]     = { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 };
312 
313   ST32A4 (pPred, LD32 (kuiList + 1));
314   ST32A4 (pPred + kiStride, LD32 (kuiList + 6));
315   ST32A4 (pPred + kiStride2, LD32 (kuiList));
316   ST32A4 (pPred + kiStride3, LD32 (kuiList + 5));
317 }
318 
319 /*horizontal up*/
WelsI4x4LumaPredHU_c(uint8_t * pPred,const int32_t kiStride)320 void WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride) {
321   const int32_t kiStride2       = kiStride << 1;
322   const int32_t kiStride3       = kiStride + kiStride2;
323   /*get pLeft*/
324   const uint8_t kuiL0           = pPred[          - 1];
325   const uint8_t kuiL1           = pPred[kiStride  - 1];
326   const uint8_t kuiL2           = pPred[kiStride2 - 1];
327   const uint8_t kuiL3           = pPred[kiStride3 - 1];
328   const uint16_t kuiL01         = 1 + kuiL0 + kuiL1;
329   const uint16_t kuiL12         = 1 + kuiL1 + kuiL2;
330   const uint16_t kuiL23         = 1 + kuiL2 + kuiL3;
331   const uint8_t kuiHU0          = kuiL01 >> 1;
332   const uint8_t kuiHU1          = (kuiL01 + kuiL12) >> 2;
333   const uint8_t kuiHU2          = kuiL12 >> 1;
334   const uint8_t kuiHU3          = (kuiL12 + kuiL23) >> 2;
335   const uint8_t kuiHU4          = kuiL23 >> 1;
336   const uint8_t kuiHU5          = (1 + kuiL23 + (kuiL3 << 1)) >> 2;
337   const uint8_t kuiList[10]     = { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 };
338 
339   ST32A4 (pPred, LD32 (kuiList));
340   ST32A4 (pPred + kiStride, LD32 (kuiList + 2));
341   ST32A4 (pPred + kiStride2, LD32 (kuiList + 4));
342   ST32A4 (pPred + kiStride3, LD32 (kuiList + 6));
343 }
344 
345 /*horizontal down*/
WelsI4x4LumaPredHD_c(uint8_t * pPred,const int32_t kiStride)346 void WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride) {
347   const int32_t kiStride2       = kiStride << 1;
348   const int32_t kiStride3       = kiStride + kiStride2;
349   const uint8_t kuiLT           = pPred[- (kiStride + 1)];
350   /*get pLeft and pTop*/
351   const uint8_t kuiL0           = pPred[-1            ];
352   const uint8_t kuiL1           = pPred[-1 + kiStride ];
353   const uint8_t kuiL2           = pPred[-1 + kiStride2];
354   const uint8_t kuiL3           = pPred[-1 + kiStride3];
355   const uint8_t kuiT0           = pPred[-kiStride     ];
356   const uint8_t kuiT1           = pPred[-kiStride + 1 ];
357   const uint8_t kuiT2           = pPred[-kiStride + 2 ];
358   const uint16_t kuiTL0         = 1 + kuiLT + kuiL0;
359   const uint16_t kuiLT0         = 1 + kuiLT + kuiT0;
360   const uint16_t kuiT01         = 1 + kuiT0 + kuiT1;
361   const uint16_t kuiT12         = 1 + kuiT1 + kuiT2;
362   const uint16_t kuiL01         = 1 + kuiL0 + kuiL1;
363   const uint16_t kuiL12         = 1 + kuiL1 + kuiL2;
364   const uint16_t kuiL23         = 1 + kuiL2 + kuiL3;
365   const uint8_t kuiHD0          = kuiTL0 >> 1;
366   const uint8_t kuiHD1          = (kuiTL0 + kuiLT0) >> 2;
367   const uint8_t kuiHD2          = (kuiLT0 + kuiT01) >> 2;
368   const uint8_t kuiHD3          = (kuiT01 + kuiT12) >> 2;
369   const uint8_t kuiHD4          = kuiL01 >> 1;
370   const uint8_t kuiHD5          = (kuiTL0 + kuiL01) >> 2;
371   const uint8_t kuiHD6          = kuiL12 >> 1;
372   const uint8_t kuiHD7          = (kuiL01 + kuiL12) >> 2;
373   const uint8_t kuiHD8          = kuiL23 >> 1;
374   const uint8_t kuiHD9          = (kuiL12 + kuiL23) >> 2;
375   const uint8_t kuiList[10]     = { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 };
376 
377   ST32A4 (pPred, LD32 (kuiList + 6));
378   ST32A4 (pPred + kiStride, LD32 (kuiList + 4));
379   ST32A4 (pPred + kiStride2, LD32 (kuiList + 2));
380   ST32A4 (pPred + kiStride3, LD32 (kuiList));
381 }
382 
/*!
 * \brief   8x8 luma vertical prediction: the (1,2,1)/4 low-pass filtered row
 *          above the block is copied into each of the block's eight rows.
 *
 * \param   pPred     first sample of the 8x8 block; the row at pPred - kiStride is the source
 * \param   kiStride  offset between two vertically adjacent samples
 * \param   bTLAvail  true when the top-left sample pPred[-1 - kiStride] may be used to
 *                    filter the first top sample; otherwise that sample is weighted 3x
 * \param   bTRAvail  true when the top-right sample pPred[8 - kiStride] may be used to
 *                    filter the last top sample; otherwise that sample is weighted 3x
 *
 * \note    The filtered row is written with a byte-wise copy so that the sample
 *          order in memory does not depend on the host byte order (packing the
 *          bytes into a 64-bit integer with shifts is only correct on
 *          little-endian targets).
 */
void WelsI8x8LumaPredV_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiPixelFilterT[8];
  int32_t i;

  uiPixelFilterT[0] = bTLAvail ? ((pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2)
                      : ((pTop[0] * 3 + pTop[1] + 2) >> 2);
  for (i = 1; i < 7; i++) {
    uiPixelFilterT[i] = ((pTop[i - 1] + (pTop[i] << 1) + pTop[i + 1] + 2) >> 2);
  }
  uiPixelFilterT[7] = bTRAvail ? ((pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2)
                      : ((pTop[6] + pTop[7] * 3 + 2) >> 2);

  // 8-89
  for (i = 0; i < 8; i++) {
    memcpy (pPred + kiStride * i, uiPixelFilterT, sizeof (uiPixelFilterT));
  }
}
410 
WelsI8x8LumaPredH_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)411 void WelsI8x8LumaPredH_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
412   uint64_t uiLeft;
413   int32_t iStride[8];
414   uint8_t uiPixelFilterL[8];
415   int32_t i;
416 
417   for (iStride[0] = 0, i = 1; i < 8; i++) {
418     iStride[i] = iStride[i - 1] + kiStride;
419   }
420 
421   uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
422                         pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
423   for (i = 1; i < 7; i++) {
424     uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
425                          2);
426   }
427   uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
428 
429   // 8-90
430   for (i = 0; i < 8; i++) {
431     uiLeft = 0x0101010101010101ULL * uiPixelFilterL[i];
432     ST64A8 (pPred + iStride[i], uiLeft);
433   }
434 }
435 
WelsI8x8LumaPredDc_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)436 void WelsI8x8LumaPredDc_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
437   int32_t iStride[8];
438   uint8_t uiPixelFilterL[8];
439   uint8_t uiPixelFilterT[8];
440   uint16_t uiTotal = 0;
441   int32_t i;
442 
443   for (iStride[0] = 0, i = 1; i < 8; i++) {
444     iStride[i] = iStride[i - 1] + kiStride;
445   }
446 
447   uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
448                         pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
449   uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
450                         pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2);
451   for (i = 1; i < 7; i++) {
452     uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
453                          2);
454     uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
455   }
456   uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
457   uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
458                         pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2);
459 
460   // 8-91
461   for (i = 0; i < 8; i++) {
462     uiTotal += uiPixelFilterL[i];
463     uiTotal += uiPixelFilterT[i];
464   }
465 
466   const uint8_t kuiMean = ((uiTotal + 8) >> 4);
467   const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
468 
469   for (i = 0; i < 8; i++) {
470     ST64A8 (pPred + iStride[i], kuiMean64);
471   }
472 }
473 
WelsI8x8LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)474 void WelsI8x8LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
475   int32_t iStride[8];
476   uint8_t uiPixelFilterL[8];
477   uint16_t uiTotal = 0;
478   int32_t i;
479 
480   for (iStride[0] = 0, i = 1; i < 8; i++) {
481     iStride[i] = iStride[i - 1] + kiStride;
482   }
483 
484   uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
485                         pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2);
486   for (i = 1; i < 7; i++) {
487     uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
488                          2);
489   }
490   uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2);
491 
492   // 8-92
493   for (i = 0; i < 8; i++) {
494     uiTotal += uiPixelFilterL[i];
495   }
496 
497   const uint8_t kuiMean = ((uiTotal + 4) >> 3);
498   const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
499 
500   for (i = 0; i < 8; i++) {
501     ST64A8 (pPred + iStride[i], kuiMean64);
502   }
503 }
504 
WelsI8x8LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)505 void WelsI8x8LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
506   int32_t iStride[8];
507   uint8_t uiPixelFilterT[8];
508   uint16_t uiTotal = 0;
509   int32_t i;
510 
511   for (iStride[0] = 0, i = 1; i < 8; i++) {
512     iStride[i] = iStride[i - 1] + kiStride;
513   }
514 
515   uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
516                         pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2);
517   for (i = 1; i < 7; i++) {
518     uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
519   }
520   uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
521                         pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2);
522 
523   // 8-93
524   for (i = 0; i < 8; i++) {
525     uiTotal += uiPixelFilterT[i];
526   }
527 
528   const uint8_t kuiMean = ((uiTotal + 4) >> 3);
529   const uint64_t kuiMean64 = 0x0101010101010101ULL * kuiMean;
530 
531   for (i = 0; i < 8; i++) {
532     ST64A8 (pPred + iStride[i], kuiMean64);
533   }
534 }
535 
WelsI8x8LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride,bool bTLAvail,bool bTRAvail)536 void WelsI8x8LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
537   // for normal 8 bit depth, 8-94
538   const uint64_t kuiDC64 = 0x8080808080808080ULL;
539 
540   int32_t iStride[8];
541   int32_t i;
542   ST64A8 (pPred, kuiDC64);
543   for (iStride[0] = 0, i = 1; i < 8; i++) {
544     iStride[i] = iStride[i - 1] + kiStride;
545     ST64A8 (pPred + iStride[i], kuiDC64);
546   }
547 }
548 
/*!
 * \brief   8x8 luma diagonal down-left prediction; reads the sixteen samples
 *          above and above-right of the block (top and top-right available).
 *
 *          The top row is low-pass filtered, then filtered once more with
 *          (1,2,1)/4 to obtain one value per anti-diagonal: the sample at
 *          (x, y) depends only on x + y.
 *
 * \param   pPred     first sample of the 8x8 block; pPred[-kiStride .. -kiStride + 15] are read
 * \param   kiStride  offset between two vertically adjacent samples
 * \param   bTLAvail  true when the top-left sample pPred[-1 - kiStride] may be used to
 *                    filter the first top sample
 * \param   bTRAvail  not used by this function
 */
void WelsI8x8LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];  // filtered top / top-right row
  uint8_t uiDiag[15];   // prediction value for each x + y in 0..14
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 15; ++k) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiFiltT[15] = (pTop[14] + pTop[15] * 3 + 2) >> 2;

  // 8-96
  for (k = 0; k < 14; ++k) {
    uiDiag[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }
  // 8-95: bottom-right corner (x == 7 && y == 7), last sample weighted 3x
  uiDiag[14] = (uiFiltT[14] + 3 * uiFiltT[15] + 2) >> 2;

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      pRow[iX] = uiDiag[iX + iY];
    }
  }
}
577 
/*!
 * \brief   8x8 luma diagonal down-left prediction for the case where the top
 *          row is available but the top-right samples are not: the last top
 *          sample pPred[7 - kiStride] stands in for positions 8 .. 15.
 *
 *          The sample at (x, y) depends only on x + y.
 *
 * \param   pPred     first sample of the 8x8 block; pPred[-kiStride .. -kiStride + 7] are read
 * \param   kiStride  offset between two vertically adjacent samples
 * \param   bTLAvail  true when the top-left sample pPred[-1 - kiStride] may be used to
 *                    filter the first top sample
 * \param   bTRAvail  not used by this function
 */
void WelsI8x8LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiFiltT[16];  // filtered top row, extended to 16 entries
  uint8_t uiDiag[15];   // prediction value for each x + y in 0..14
  int32_t iX, iY, k;

  if (bTLAvail) {
    uiFiltT[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiFiltT[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (k = 1; k < 7; ++k) {
    uiFiltT[k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  // p[x, -1] for x = 8..15 is replaced with p[7, -1]
  uiFiltT[7] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  for (k = 8; k < 16; ++k) {
    uiFiltT[k] = pTop[7];
  }

  // 8-96
  for (k = 0; k < 14; ++k) {
    uiDiag[k] = (uiFiltT[k] + (uiFiltT[k + 1] << 1) + uiFiltT[k + 2] + 2) >> 2;
  }
  // 8-95: bottom-right corner (x == 7 && y == 7)
  uiDiag[14] = (uiFiltT[14] + 3 * uiFiltT[15] + 2) >> 2;

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      pRow[iX] = uiDiag[iX + iY];
    }
  }
}
610 
/*!
 * \brief   8x8 luma diagonal down-right prediction; top-left, top and left
 *          neighbours are all read.
 *
 *          The filtered border is arranged as one 17-entry sequence running
 *          up the left column (bottom first), through the top-left corner and
 *          along the top row. The sample at (x, y) is the (1,2,1)/4 filter of
 *          that sequence centred on entry 8 + (x - y), so it depends only on
 *          x - y (8-97, 8-98, 8-99).
 *
 * \param   pPred     first sample of the 8x8 block
 * \param   kiStride  offset between two vertically adjacent samples
 * \param   bTLAvail  not used by this function (the top-left sample is always read)
 * \param   bTRAvail  true when the top-right sample pPred[8 - kiStride] may be used to
 *                    filter the last top sample
 */
void WelsI8x8LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop  = pPred - kiStride;      // top neighbour of column x is pTop[x]
  const uint8_t* pLeft = pPred - 1;             // left neighbour of row y is pLeft[y * kiStride]
  uint8_t uiEdge[17];   // [7 - y] filtered left y, [8] filtered corner, [9 + x] filtered top x
  uint8_t uiDiag[15];   // prediction value for each (x - y) + 7 in 0..14
  int32_t iX, iY, k;

  uiEdge[8] = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;

  uiEdge[7] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  uiEdge[9] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (k = 1; k < 7; ++k) {
    const uint8_t* pCur = pLeft + k * kiStride;
    uiEdge[7 - k] = (pCur[-kiStride] + (pCur[0] << 1) + pCur[kiStride] + 2) >> 2;
    uiEdge[9 + k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiEdge[0] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;
  if (bTRAvail) {
    uiEdge[16] = (pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2;
  } else {
    uiEdge[16] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  }

  for (k = 0; k < 15; ++k) {
    uiDiag[k] = (uiEdge[k] + (uiEdge[k + 1] << 1) + uiEdge[k + 2] + 2) >> 2;
  }

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      pRow[iX] = uiDiag[iX - iY + 7];
    }
  }
}
660 
/*!
 * \brief   Intra 8x8 luma prediction, vertical-left mode, variant that reads
 *          the top and the top-right neighbours (16 samples of row -1).
 *
 * \param   pPred     top-left sample of the 8x8 block; rows 0..7 are overwritten
 * \param   kiStride  offset between two consecutive rows of pPred
 * \param   bTLAvail  when true the top-left sample pPred[-1 - kiStride] takes
 *                    part in filtering the first top sample
 * \param   bTRAvail  not read by this variant
 */
void WelsI8x8LumaPredVL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiTopFiltered[16];
  int32_t iX, iY;

  // Low-pass filter the 16 reference samples above the block before any
  // output sample is written.
  if (bTLAvail) {
    uiTopFiltered[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiTopFiltered[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (iX = 1; iX < 15; ++iX) {
    uiTopFiltered[iX] = (pTop[iX - 1] + (pTop[iX] << 1) + pTop[iX + 1] + 2) >> 2;
  }
  uiTopFiltered[15] = (pTop[14] + pTop[15] * 3 + 2) >> 2;

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    // Every pair of rows moves the read window one sample to the right.
    const uint8_t* pSrc = uiTopFiltered + (iY >> 1);

    if (iY & 0x01) { // odd row, 8-109: 3-tap average
      for (iX = 0; iX < 8; ++iX) {
        pRow[iX] = (pSrc[iX] + (pSrc[iX + 1] << 1) + pSrc[iX + 2] + 2) >> 2;
      }
    } else { // even row, 8-108: 2-tap average
      for (iX = 0; iX < 8; ++iX) {
        pRow[iX] = (pSrc[iX] + pSrc[iX + 1] + 1) >> 1;
      }
    }
  }
}
692 
/*!
 * \brief   Intra 8x8 luma prediction, vertical-left mode, variant for a block
 *          whose top-right neighbour is not available: only columns 0..7 of
 *          row -1 are read and the last one stands in for columns 8..15.
 *
 * \param   pPred     top-left sample of the 8x8 block; rows 0..7 are overwritten
 * \param   kiStride  offset between two consecutive rows of pPred
 * \param   bTLAvail  when true the top-left sample pPred[-1 - kiStride] takes
 *                    part in filtering the first top sample
 * \param   bTRAvail  not read by this variant
 */
void WelsI8x8LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pTop = pPred - kiStride;
  uint8_t uiTopFiltered[16];
  int32_t iX, iY;

  if (bTLAvail) {
    uiTopFiltered[0] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  } else {
    uiTopFiltered[0] = (pTop[0] * 3 + pTop[1] + 2) >> 2;
  }
  for (iX = 1; iX < 7; ++iX) {
    uiTopFiltered[iX] = (pTop[iX - 1] + (pTop[iX] << 1) + pTop[iX + 1] + 2) >> 2;
  }
  // p[x, -1] for x = 8..15 is replaced with p[7, -1]; the right-hand tap of
  // sample 7 therefore is p[7, -1] itself and samples 8..15 need no filtering.
  uiTopFiltered[7] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  for (iX = 8; iX < 16; ++iX) {
    uiTopFiltered[iX] = pTop[7];
  }

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    // Every pair of rows moves the read window one sample to the right.
    const uint8_t* pSrc = uiTopFiltered + (iY >> 1);

    if (iY & 0x01) { // odd row, 8-109: 3-tap average
      for (iX = 0; iX < 8; ++iX) {
        pRow[iX] = (pSrc[iX] + (pSrc[iX + 1] << 1) + pSrc[iX + 2] + 2) >> 2;
      }
    } else { // even row, 8-108: 2-tap average
      for (iX = 0; iX < 8; ++iX) {
        pRow[iX] = (pSrc[iX] + pSrc[iX + 1] + 1) >> 1;
      }
    }
  }
}
728 
/*!
 * \brief   Intra 8x8 luma prediction, vertical-right mode.
 *          The top-left, top and left neighbours are read unconditionally.
 *
 * \param   pPred     top-left sample of the 8x8 block; rows 0..7 are overwritten
 * \param   kiStride  offset between two consecutive rows of pPred
 * \param   bTLAvail  not read by this function
 * \param   bTRAvail  when true pPred[8 - kiStride] takes part in filtering the
 *                    last top sample
 */
void WelsI8x8LumaPredVR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // All filtered reference samples are kept in one array that walks up the
  // left column, through the corner and along the top row:
  //   uiEdge[kCorner - 1 - k] = filtered left sample k  (k = 0..7)
  //   uiEdge[kCorner]         = filtered top-left sample
  //   uiEdge[kCorner + 1 + k] = filtered top sample k   (k = 0..7)
  // With this layout every formula below is a 2- or 3-tap window on
  // consecutive entries, including the cases that straddle the corner.
  enum { kCorner = 8 };
  const uint8_t* pTop  = pPred - kiStride;
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiEdge[17];
  int32_t k, iX, iY;

  uiEdge[kCorner] = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;

  uiEdge[kCorner - 1] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  uiEdge[kCorner + 1] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (k = 1; k < 7; ++k) {
    uiEdge[kCorner - 1 - k] = (pLeft[ (k - 1) * kiStride] + (pLeft[k * kiStride] << 1) + pLeft[ (k + 1) * kiStride] + 2) >> 2;
    uiEdge[kCorner + 1 + k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiEdge[kCorner - 8] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;
  if (bTRAvail) {
    uiEdge[kCorner + 8] = (pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2;
  } else {
    uiEdge[kCorner + 8] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  }

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      const int32_t iZ = (iX << 1) - iY; // 2 * x - y
      if (iZ >= 0) {
        // pTap[0] is the filtered top sample x - (y >> 1) - 1, i.e. the corner
        // sample when that index is -1.
        const uint8_t* pTap = uiEdge + kCorner + iX - (iY >> 1);
        if (iZ & 0x01) { // 8-101
          pRow[iX] = (pTap[-1] + (pTap[0] << 1) + pTap[1] + 2) >> 2;
        } else { // 8-100
          pRow[iX] = (pTap[0] + pTap[1] + 1) >> 1;
        }
      } else {
        // 8-102 (iZ == -1) and 8-103 (iZ <= -2): the 3-tap window is centred
        // on the corner for iZ == -1 and slides down the left column from there.
        const uint8_t* pTap = uiEdge + kCorner + 1 + iZ;
        pRow[iX] = (pTap[-1] + (pTap[0] << 1) + pTap[1] + 2) >> 2;
      }
    }
  }
}
786 
/*!
 * \brief   Intra 8x8 luma prediction, horizontal-up mode.
 *          Only the left neighbour column (and optionally the top-left sample)
 *          is read.
 *
 * \param   pPred     top-left sample of the 8x8 block; rows 0..7 are overwritten
 * \param   kiStride  offset between two consecutive rows of pPred
 * \param   bTLAvail  when true the top-left sample pPred[-1 - kiStride] takes
 *                    part in filtering the first left sample
 * \param   bTRAvail  not read by this function
 */
void WelsI8x8LumaPredHU_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiLeftFiltered[8];
  int32_t k, iX, iY;

  // Low-pass filter the eight reference samples left of the block.
  if (bTLAvail) {
    uiLeftFiltered[0] = (pLeft[-kiStride] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  } else {
    uiLeftFiltered[0] = (pLeft[0] * 3 + pLeft[kiStride] + 2) >> 2;
  }
  for (k = 1; k < 7; ++k) {
    uiLeftFiltered[k] = (pLeft[ (k - 1) * kiStride] + (pLeft[k * kiStride] << 1) + pLeft[ (k + 1) * kiStride] + 2) >> 2;
  }
  uiLeftFiltered[7] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      const int32_t iZ   = iX + (iY << 1); // x + 2 * y
      const int32_t iIdx = iZ >> 1;
      uint8_t uiVal;

      if (iZ > 13) { // 8-113: past the last left sample, repeat it
        uiVal = uiLeftFiltered[7];
      } else if (iZ == 13) { // 8-112
        uiVal = (uiLeftFiltered[6] + 3 * uiLeftFiltered[7] + 2) >> 2;
      } else if (iZ & 0x01) { // 8-111
        uiVal = (uiLeftFiltered[iIdx] + (uiLeftFiltered[iIdx + 1] << 1) + uiLeftFiltered[iIdx + 2] + 2) >> 2;
      } else { // 8-110
        uiVal = (uiLeftFiltered[iIdx] + uiLeftFiltered[iIdx + 1] + 1) >> 1;
      }
      pRow[iX] = uiVal;
    }
  }
}
824 
/*!
 * \brief   Intra 8x8 luma prediction, horizontal-down mode.
 *          The top-left, top and left neighbours are read unconditionally.
 *
 * \param   pPred     top-left sample of the 8x8 block; rows 0..7 are overwritten
 * \param   kiStride  offset between two consecutive rows of pPred
 * \param   bTLAvail  not read by this function
 * \param   bTRAvail  when true pPred[8 - kiStride] takes part in filtering the
 *                    last top sample
 */
void WelsI8x8LumaPredHD_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) {
  // All filtered reference samples are kept in one array that walks up the
  // left column, through the corner and along the top row:
  //   uiEdge[kCorner - 1 - k] = filtered left sample k  (k = 0..7)
  //   uiEdge[kCorner]         = filtered top-left sample
  //   uiEdge[kCorner + 1 + k] = filtered top sample k   (k = 0..7)
  // With this layout every formula below is a 2- or 3-tap window on
  // consecutive entries, including the cases that straddle the corner.
  enum { kCorner = 8 };
  const uint8_t* pTop  = pPred - kiStride;
  const uint8_t* pLeft = pPred - 1;
  uint8_t uiEdge[17];
  int32_t k, iX, iY;

  uiEdge[kCorner] = (pLeft[0] + (pTop[-1] << 1) + pTop[0] + 2) >> 2;

  uiEdge[kCorner - 1] = (pTop[-1] + (pLeft[0] << 1) + pLeft[kiStride] + 2) >> 2;
  uiEdge[kCorner + 1] = (pTop[-1] + (pTop[0] << 1) + pTop[1] + 2) >> 2;
  for (k = 1; k < 7; ++k) {
    uiEdge[kCorner - 1 - k] = (pLeft[ (k - 1) * kiStride] + (pLeft[k * kiStride] << 1) + pLeft[ (k + 1) * kiStride] + 2) >> 2;
    uiEdge[kCorner + 1 + k] = (pTop[k - 1] + (pTop[k] << 1) + pTop[k + 1] + 2) >> 2;
  }
  uiEdge[kCorner - 8] = (pLeft[6 * kiStride] + pLeft[7 * kiStride] * 3 + 2) >> 2;
  if (bTRAvail) {
    uiEdge[kCorner + 8] = (pTop[6] + (pTop[7] << 1) + pTop[8] + 2) >> 2;
  } else {
    uiEdge[kCorner + 8] = (pTop[6] + pTop[7] * 3 + 2) >> 2;
  }

  for (iY = 0; iY < 8; ++iY) {
    uint8_t* pRow = pPred + iY * kiStride;
    for (iX = 0; iX < 8; ++iX) {
      const int32_t iZ = (iY << 1) - iX; // 2 * y - x
      if (iZ >= 0) {
        // pTap[0] is the filtered left sample y - (x >> 1) - 1, i.e. the corner
        // sample when that index is -1; pTap[-1] is the next sample down.
        const uint8_t* pTap = uiEdge + kCorner - (iY - (iX >> 1));
        if (iZ & 0x01) { // 8-105
          pRow[iX] = (pTap[1] + (pTap[0] << 1) + pTap[-1] + 2) >> 2;
        } else { // 8-104
          pRow[iX] = (pTap[0] + pTap[-1] + 1) >> 1;
        }
      } else {
        // 8-106 (iZ == -1) and 8-107 (iZ <= -2): the 3-tap window is centred
        // on the corner for iZ == -1 and slides right along the top row from there.
        const uint8_t* pTap = uiEdge + kCorner - 1 - iZ;
        pRow[iX] = (pTap[-1] + (pTap[0] << 1) + pTap[1] + 2) >> 2;
      }
    }
  }
}
882 
883 
WelsIChromaPredV_c(uint8_t * pPred,const int32_t kiStride)884 void WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride) {
885   const uint64_t kuiVal64 = LD64A8 (&pPred[-kiStride]);
886   const int32_t kiStride2 = kiStride  << 1;
887   const int32_t kiStride4 = kiStride2 << 1;
888 
889   ST64A8 (pPred, kuiVal64);
890   ST64A8 (pPred + kiStride, kuiVal64);
891   ST64A8 (pPred + kiStride2, kuiVal64);
892   ST64A8 (pPred + kiStride2 + kiStride, kuiVal64);
893   ST64A8 (pPred + kiStride4, kuiVal64);
894   ST64A8 (pPred + kiStride4 + kiStride, kuiVal64);
895   ST64A8 (pPred + kiStride4 + kiStride2, kuiVal64);
896   ST64A8 (pPred + (kiStride << 3) - kiStride, kuiVal64);
897 }
898 
WelsIChromaPredH_c(uint8_t * pPred,const int32_t kiStride)899 void WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride) {
900   int32_t iTmp = (kiStride << 3) - kiStride;
901   uint8_t i = 7;
902 
903   do {
904     const uint8_t kuiVal8   = pPred[iTmp - 1];
905     const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8;
906 
907     ST64A8 (pPred + iTmp, kuiVal64);
908 
909     iTmp -= kiStride;
910   } while (i-- > 0);
911 }
912 
913 
WelsIChromaPredPlane_c(uint8_t * pPred,const int32_t kiStride)914 void WelsIChromaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
915   int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
916   int32_t i, j;
917   uint8_t* pTop = &pPred[-kiStride];
918   uint8_t* pLeft = &pPred[-1];
919 
920   for (i = 0 ; i < 4 ; i ++) {
921     H += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
922     V += (i + 1) * (pLeft[ (4 + i) * kiStride] - pLeft[ (2 - i) * kiStride]);
923   }
924 
925   a = (pLeft[7 * kiStride] + pTop[7]) << 4;
926   b = (17 * H + 16) >> 5;
927   c = (17 * V + 16) >> 5;
928 
929   for (i = 0 ; i < 8 ; i ++) {
930     for (j = 0 ; j < 8 ; j ++) {
931       int32_t iTmp = (a + b * (j - 3) + c * (i - 3) + 16) >> 5;
932       iTmp = WelsClip1 (iTmp);
933       pPred[j] = iTmp;
934     }
935     pPred += kiStride;
936   }
937 }
938 
939 
WelsIChromaPredDc_c(uint8_t * pPred,const int32_t kiStride)940 void WelsIChromaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
941   const int32_t kiL1            = kiStride - 1;
942   const int32_t kiL2            = kiL1 + kiStride;
943   const int32_t kiL3            = kiL2 + kiStride;
944   const int32_t kiL4            = kiL3 + kiStride;
945   const int32_t kiL5            = kiL4 + kiStride;
946   const int32_t kiL6            = kiL5 + kiStride;
947   const int32_t kiL7            = kiL6 + kiStride;
948   /*caculate the kMean value*/
949   const uint8_t kuiM1           = (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] +
950                                    pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 4) >> 3 ;
951   const uint32_t kuiSum2        = pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride];
952   const uint32_t kuiSum3        = pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7];
953   const uint8_t kuiM2           = (kuiSum2 + 2) >> 2;
954   const uint8_t kuiM3           = (kuiSum3 + 2) >> 2;
955   const uint8_t kuiM4           = (kuiSum2 + kuiSum3 + 4) >> 3;
956   const uint8_t kuiMUP[8]       = {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
957   const uint8_t kuiMDown[8]     = {kuiM3, kuiM3, kuiM3, kuiM3, kuiM4, kuiM4, kuiM4, kuiM4};
958   const uint64_t kuiUP64        = LD64 (kuiMUP);
959   const uint64_t kuiDN64        = LD64 (kuiMDown);
960 
961   ST64A8 (pPred, kuiUP64);
962   ST64A8 (pPred + kiL1 + 1, kuiUP64);
963   ST64A8 (pPred + kiL2 + 1, kuiUP64);
964   ST64A8 (pPred + kiL3 + 1, kuiUP64);
965   ST64A8 (pPred + kiL4 + 1, kuiDN64);
966   ST64A8 (pPred + kiL5 + 1, kuiDN64);
967   ST64A8 (pPred + kiL6 + 1, kuiDN64);
968   ST64A8 (pPred + kiL7 + 1, kuiDN64);
969 }
970 
WelsIChromaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)971 void WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
972   const int32_t kiL1    =   -1 + kiStride;
973   const int32_t kiL2    = kiL1 + kiStride;
974   const int32_t kiL3    = kiL2 + kiStride;
975   const int32_t kiL4    = kiL3 + kiStride;
976   const int32_t kiL5    = kiL4 + kiStride;
977   const int32_t kiL6    = kiL5 + kiStride;
978   const int32_t kiL7    = kiL6 + kiStride;
979   /*caculate the kMean value*/
980   const uint8_t kuiMUP   = (pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 2) >> 2 ;
981   const uint8_t kuiMDown = (pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7] + 2) >> 2;
982   const uint64_t kuiUP64 = 0x0101010101010101ULL * kuiMUP;
983   const uint64_t kuiDN64 = 0x0101010101010101ULL * kuiMDown;
984 
985   ST64A8 (pPred, kuiUP64);
986   ST64A8 (pPred + kiL1 + 1, kuiUP64);
987   ST64A8 (pPred + kiL2 + 1, kuiUP64);
988   ST64A8 (pPred + kiL3 + 1, kuiUP64);
989   ST64A8 (pPred + kiL4 + 1, kuiDN64);
990   ST64A8 (pPred + kiL5 + 1, kuiDN64);
991   ST64A8 (pPred + kiL6 + 1, kuiDN64);
992   ST64A8 (pPred + kiL7 + 1, kuiDN64);
993 }
994 
WelsIChromaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)995 void WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
996   int32_t iTmp          = (kiStride << 3) - kiStride;
997   /*caculate the kMean value*/
998   const uint8_t kuiM1   = (pPred[-kiStride] + pPred[1 - kiStride] + pPred[2 - kiStride] + pPred[3 - kiStride] + 2) >> 2;
999   const uint8_t kuiM2   = (pPred[4 - kiStride] + pPred[5 - kiStride] + pPred[6 - kiStride] + pPred[7 - kiStride] + 2) >>
1000                           2;
1001   const uint8_t kuiM[8] = {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};
1002 
1003   uint8_t i = 7;
1004 
1005   do {
1006     ST64A8 (pPred + iTmp, LD64 (kuiM));
1007 
1008     iTmp -= kiStride;
1009   } while (i-- > 0);
1010 }
1011 
WelsIChromaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)1012 void WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
1013   int32_t iTmp = (kiStride << 3) - kiStride;
1014   const uint64_t kuiDC64 = 0x8080808080808080ULL;
1015   uint8_t i = 7;
1016 
1017   do {
1018     ST64A8 (pPred + iTmp, kuiDC64);
1019 
1020     iTmp -= kiStride;
1021   } while (i-- > 0);
1022 }
1023 
WelsI16x16LumaPredV_c(uint8_t * pPred,const int32_t kiStride)1024 void WelsI16x16LumaPredV_c (uint8_t* pPred, const int32_t kiStride) {
1025   int32_t iTmp            = (kiStride << 4) - kiStride;
1026   const uint64_t kuiTop1  = LD64A8 (pPred - kiStride);
1027   const uint64_t kuiTop2  = LD64A8 (pPred - kiStride + 8);
1028   uint8_t i = 15;
1029 
1030   do {
1031     ST64A8 (pPred + iTmp, kuiTop1);
1032     ST64A8 (pPred + iTmp + 8, kuiTop2);
1033 
1034     iTmp -= kiStride;
1035   } while (i-- > 0);
1036 }
1037 
WelsI16x16LumaPredH_c(uint8_t * pPred,const int32_t kiStride)1038 void WelsI16x16LumaPredH_c (uint8_t* pPred, const int32_t kiStride) {
1039   int32_t iTmp = (kiStride << 4) - kiStride;
1040   uint8_t i = 15;
1041 
1042   do {
1043     const uint8_t kuiVal8   = pPred[iTmp - 1];
1044     const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8;
1045 
1046     ST64A8 (pPred + iTmp, kuiVal64);
1047     ST64A8 (pPred + iTmp + 8, kuiVal64);
1048 
1049     iTmp -= kiStride;
1050   } while (i-- > 0);
1051 }
1052 
WelsI16x16LumaPredPlane_c(uint8_t * pPred,const int32_t kiStride)1053 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, const int32_t kiStride) {
1054   int32_t a = 0, b = 0, c = 0, H = 0, V = 0;
1055   int32_t i, j;
1056   uint8_t* pTop = &pPred[-kiStride];
1057   uint8_t* pLeft = &pPred[-1];
1058 
1059   for (i = 0 ; i < 8 ; i ++) {
1060     H += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
1061     V += (i + 1) * (pLeft[ (8 + i) * kiStride] - pLeft[ (6 - i) * kiStride]);
1062   }
1063 
1064   a = (pLeft[15 * kiStride] + pTop[15]) << 4;
1065   b = (5 * H + 32) >> 6;
1066   c = (5 * V + 32) >> 6;
1067 
1068   for (i = 0 ; i < 16 ; i ++) {
1069     for (j = 0 ; j < 16 ; j ++) {
1070       int32_t iTmp = (a + b * (j - 7) + c * (i - 7) + 16) >> 5;
1071       iTmp = WelsClip1 (iTmp);
1072       pPred[j] = iTmp;
1073     }
1074     pPred += kiStride;
1075   }
1076 }
1077 
WelsI16x16LumaPredDc_c(uint8_t * pPred,const int32_t kiStride)1078 void WelsI16x16LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) {
1079   int32_t iTmp = (kiStride << 4) - kiStride;
1080   int32_t iSum = 0;
1081   uint8_t i = 15;
1082   uint8_t uiMean = 0;
1083 
1084   /*caculate the kMean value*/
1085   do {
1086     iSum += pPred[-1 + iTmp] + pPred[-kiStride + i];
1087     iTmp -= kiStride;
1088   } while (i-- > 0);
1089   uiMean = (16 + iSum) >> 5;
1090 
1091   iTmp = (kiStride << 4) - kiStride;
1092   i = 15;
1093   do {
1094     memset (&pPred[iTmp], uiMean, I16x16_COUNT);
1095     iTmp -= kiStride;
1096   } while (i-- > 0);
1097 }
1098 
1099 
WelsI16x16LumaPredDcTop_c(uint8_t * pPred,const int32_t kiStride)1100 void WelsI16x16LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) {
1101   int32_t iTmp = (kiStride << 4) - kiStride;
1102   int32_t iSum = 0;
1103   uint8_t i = 15;
1104   uint8_t uiMean = 0;
1105 
1106   /*caculate the kMean value*/
1107   do {
1108     iSum += pPred[-kiStride + i];
1109   } while (i-- > 0);
1110   uiMean = (8 + iSum) >> 4;
1111 
1112   i = 15;
1113   do {
1114     memset (&pPred[iTmp], uiMean, I16x16_COUNT);
1115     iTmp -= kiStride;
1116   } while (i-- > 0);
1117 }
1118 
WelsI16x16LumaPredDcLeft_c(uint8_t * pPred,const int32_t kiStride)1119 void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) {
1120   int32_t iTmp = (kiStride << 4) - kiStride;
1121   int32_t iSum = 0;
1122   uint64_t uiMean64 = 0;
1123   uint8_t uiMean = 0;
1124   uint8_t i = 15;
1125 
1126   /*caculate the kMean value*/
1127   do {
1128     iSum += pPred[-1 + iTmp];
1129     iTmp -= kiStride;
1130   } while (i-- > 0);
1131   uiMean   = (8 + iSum) >> 4;
1132   uiMean64 = 0x0101010101010101ULL * uiMean;
1133 
1134   iTmp = (kiStride << 4) - kiStride;
1135   i = 15;
1136   do {
1137     ST64A8 (pPred + iTmp, uiMean64);
1138     ST64A8 (pPred + iTmp + 8, uiMean64);
1139 
1140     iTmp -= kiStride;
1141   } while (i-- > 0);
1142 }
1143 
WelsI16x16LumaPredDcNA_c(uint8_t * pPred,const int32_t kiStride)1144 void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) {
1145   const uint64_t kuiDC64 = 0x8080808080808080ULL;
1146   int32_t iTmp = (kiStride << 4) - kiStride;
1147   uint8_t i = 15;
1148 
1149   do {
1150     ST64A8 (pPred + iTmp, kuiDC64);
1151     ST64A8 (pPred + iTmp + 8, kuiDC64);
1152 
1153     iTmp -= kiStride;
1154   } while (i-- > 0);
1155 }
1156 
1157 } // namespace WelsDec
1158