1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * H.264 / AVC / MPEG-4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
26 */
27
28 #include "libavutil/intreadwrite.h"
29
30 #include "mathops.h"
31
32 #include "bit_depth_template.c"
33
FUNCC(pred4x4_vertical)34 static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
35 ptrdiff_t _stride)
36 {
37 pixel *src = (pixel*)_src;
38 int stride = _stride>>(sizeof(pixel)-1);
39 const pixel4 a= AV_RN4PA(src-stride);
40
41 AV_WN4PA(src+0*stride, a);
42 AV_WN4PA(src+1*stride, a);
43 AV_WN4PA(src+2*stride, a);
44 AV_WN4PA(src+3*stride, a);
45 }
46
FUNCC(pred4x4_horizontal)47 static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
48 ptrdiff_t _stride)
49 {
50 pixel *src = (pixel*)_src;
51 int stride = _stride>>(sizeof(pixel)-1);
52 AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
53 AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
54 AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
55 AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
56 }
57
FUNCC(pred4x4_dc)58 static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
59 ptrdiff_t _stride)
60 {
61 pixel *src = (pixel*)_src;
62 int stride = _stride>>(sizeof(pixel)-1);
63 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
64 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
65 const pixel4 a = PIXEL_SPLAT_X4(dc);
66
67 AV_WN4PA(src+0*stride, a);
68 AV_WN4PA(src+1*stride, a);
69 AV_WN4PA(src+2*stride, a);
70 AV_WN4PA(src+3*stride, a);
71 }
72
FUNCC(pred4x4_left_dc)73 static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
74 ptrdiff_t _stride)
75 {
76 pixel *src = (pixel*)_src;
77 int stride = _stride>>(sizeof(pixel)-1);
78 const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
79 const pixel4 a = PIXEL_SPLAT_X4(dc);
80
81 AV_WN4PA(src+0*stride, a);
82 AV_WN4PA(src+1*stride, a);
83 AV_WN4PA(src+2*stride, a);
84 AV_WN4PA(src+3*stride, a);
85 }
86
FUNCC(pred4x4_top_dc)87 static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
88 ptrdiff_t _stride)
89 {
90 pixel *src = (pixel*)_src;
91 int stride = _stride>>(sizeof(pixel)-1);
92 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
93 const pixel4 a = PIXEL_SPLAT_X4(dc);
94
95 AV_WN4PA(src+0*stride, a);
96 AV_WN4PA(src+1*stride, a);
97 AV_WN4PA(src+2*stride, a);
98 AV_WN4PA(src+3*stride, a);
99 }
100
FUNCC(pred4x4_128_dc)101 static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
102 ptrdiff_t _stride)
103 {
104 pixel *src = (pixel*)_src;
105 int stride = _stride>>(sizeof(pixel)-1);
106 const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
107
108 AV_WN4PA(src+0*stride, a);
109 AV_WN4PA(src+1*stride, a);
110 AV_WN4PA(src+2*stride, a);
111 AV_WN4PA(src+3*stride, a);
112 }
113
114
/* Load the four top-right neighbours t4..t7 from the caller-supplied
 * topright pointer (pixels above-right of a 4x4 block). */
#define LOAD_TOP_RIGHT_EDGE\
    const unsigned av_unused t4 = topright[0];\
    const unsigned av_unused t5 = topright[1];\
    const unsigned av_unused t6 = topright[2];\
    const unsigned av_unused t7 = topright[3];\

/* Load the four below-left neighbours l4..l7 (left column, rows 4-7). */
#define LOAD_DOWN_LEFT_EDGE\
    const unsigned av_unused l4 = src[-1+4*stride];\
    const unsigned av_unused l5 = src[-1+5*stride];\
    const unsigned av_unused l6 = src[-1+6*stride];\
    const unsigned av_unused l7 = src[-1+7*stride];\

/* Load the four left neighbours l0..l3 (left column, rows 0-3). */
#define LOAD_LEFT_EDGE\
    const unsigned av_unused l0 = src[-1+0*stride];\
    const unsigned av_unused l1 = src[-1+1*stride];\
    const unsigned av_unused l2 = src[-1+2*stride];\
    const unsigned av_unused l3 = src[-1+3*stride];\

/* Load the four top neighbours t0..t3 (row above the block). */
#define LOAD_TOP_EDGE\
    const unsigned av_unused t0 = src[ 0-1*stride];\
    const unsigned av_unused t1 = src[ 1-1*stride];\
    const unsigned av_unused t2 = src[ 2-1*stride];\
    const unsigned av_unused t3 = src[ 3-1*stride];\

138
/* 4x4 diagonal down-right prediction: each down-right diagonal is
 * filled with a 3-tap (1,2,1) filtered value of the top/left/corner
 * neighbours. The exact taps follow the H.264 spec and must not be
 * reordered or re-derived. */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                      ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];   /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;   /* main diagonal */
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}
165
/* 4x4 diagonal down-left prediction: each down-left diagonal is a
 * (1,2,1) filtered value of the top and top-right neighbours; the
 * bottom-right sample uses the (1,3) end-of-edge filter. */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
                                     ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
//    LOAD_LEFT_EDGE

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;   /* edge: (1,3) taps */
}
193
/* 4x4 vertical-right prediction: even rows use 2-tap averages of the
 * top edge, odd rows use 3-tap (1,2,1) filtered values; the first
 * column of rows 2-3 comes from the left edge. */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                          const uint8_t *topright,
                                          ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];   /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}
221
/* 4x4 vertical-left prediction: even rows use 2-tap averages, odd rows
 * use 3-tap (1,2,1) filtered values of the top/top-right neighbours. */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
                                         const uint8_t *_topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4+ 1)>>1;
    src[3+2*stride]=(t4 + t5+ 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}
249
/* 4x4 horizontal-up prediction: interpolates upward along the left
 * edge; positions past the last left neighbour are clamped to l3. */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;   /* == (l2 + 3*l3 + 2)>>2 */
    src[3+2*stride]=
    src[1+3*stride]=
    src[0+3*stride]=
    src[2+2*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;   /* remainder is flat: last left neighbour */
}
274
/* 4x4 horizontal-down prediction: even columns use 2-tap averages of
 * the left edge, odd columns use 3-tap (1,2,1) filtered values; the
 * top row mixes in the corner and top neighbours. */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                           const uint8_t *topright,
                                           ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];   /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2+ 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
302
FUNCC(pred16x16_vertical)303 static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
304 {
305 int i;
306 pixel *src = (pixel*)_src;
307 int stride = _stride>>(sizeof(pixel)-1);
308 const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
309 const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
310 const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
311 const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
312
313 for(i=0; i<16; i++){
314 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
315 AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
316 AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
317 AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
318 }
319 }
320
FUNCC(pred16x16_horizontal)321 static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
322 {
323 int i;
324 pixel *src = (pixel*)_src;
325 stride >>= sizeof(pixel)-1;
326
327 for(i=0; i<16; i++){
328 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
329
330 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
331 AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
332 AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
333 AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
334 }
335 }
336
/* Fill a 16x16 block with the splatted DC value v; requires an int i
 * and advances src to one row past the block. */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
        src += stride;\
    }
345
FUNCC(pred16x16_dc)346 static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
347 {
348 int i, dc=0;
349 pixel *src = (pixel*)_src;
350 pixel4 dcsplat;
351 stride >>= sizeof(pixel)-1;
352
353 for(i=0;i<16; i++){
354 dc+= src[-1+i*stride];
355 }
356
357 for(i=0;i<16; i++){
358 dc+= src[i-stride];
359 }
360
361 dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
362 PREDICT_16x16_DC(dcsplat);
363 }
364
FUNCC(pred16x16_left_dc)365 static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
366 {
367 int i, dc=0;
368 pixel *src = (pixel*)_src;
369 pixel4 dcsplat;
370 stride >>= sizeof(pixel)-1;
371
372 for(i=0;i<16; i++){
373 dc+= src[-1+i*stride];
374 }
375
376 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
377 PREDICT_16x16_DC(dcsplat);
378 }
379
FUNCC(pred16x16_top_dc)380 static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
381 {
382 int i, dc=0;
383 pixel *src = (pixel*)_src;
384 pixel4 dcsplat;
385 stride >>= sizeof(pixel)-1;
386
387 for(i=0;i<16; i++){
388 dc+= src[i-stride];
389 }
390
391 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
392 PREDICT_16x16_DC(dcsplat);
393 }
394
/* Generate a 16x16 fixed-value DC predictor named pred16x16_<n>_dc that
 * fills the block with constant v. The 127/129 variants exist only at
 * 8-bit depth. */
#define PRED16x16_X(n, v) \
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
}

PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
#if BIT_DEPTH == 8
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
#endif
409
/* 16x16 plane prediction, shared between H.264 and the SVQ3/RV40
 * codecs which use different gradient scaling; the svq3/rv40 flags
 * select the codec-specific H/V scaling. The arithmetic is
 * bit-exactness-critical and must not be reordered. */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
                                                 ptrdiff_t _stride,
                                                 const int svq3,
                                                 const int rv40)
{
    int i, j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +7-stride;   /* centre of the top edge */
    const pixel *       src1 = src +8*stride-1; /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;   // == src+6*stride-1;
    int H = src0[1] - src0[-1];                 /* horizontal gradient accumulator */
    int V = src1[0] - src2[ 0];                 /* vertical gradient accumulator */
    /* Weighted sums of symmetric neighbour differences, weights 1..8. */
    for(k=2; k<=8; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    if(svq3){
        H = ( 5*(H/4) ) / 16;
        V = ( 5*(V/4) ) / 16;

        /* required for 100% accuracy */
        i = H; H = V; V = i;
    }else if(rv40){
        H = ( H + (H>>2) ) >> 4;
        V = ( V + (V>>2) ) >> 4;
    }else{
        H = ( 5*H+32 ) >> 6;
        V = ( 5*V+32 ) >> 6;
    }

    /* a = 32*(mean plane offset); per-pixel value is (a + x*H + y*V)>>5. */
    a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
    for(j=16; j>0; --j) {
        int b = a;
        a += V;
        for(i=-16; i<0; i+=4) {
            src[16+i] = CLIP((b    ) >> 5);
            src[17+i] = CLIP((b+  H) >> 5);
            src[18+i] = CLIP((b+2*H) >> 5);
            src[19+i] = CLIP((b+3*H) >> 5);
            b += 4*H;
        }
        src += stride;
    }
}
458
/* Plain H.264 16x16 plane prediction: the compat helper with both
 * codec-specific modes disabled. */
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
}
463
FUNCC(pred8x8_vertical)464 static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
465 {
466 int i;
467 pixel *src = (pixel*)_src;
468 int stride = _stride>>(sizeof(pixel)-1);
469 const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
470 const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
471
472 for(i=0; i<8; i++){
473 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
474 AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
475 }
476 }
477
FUNCC(pred8x16_vertical)478 static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
479 {
480 int i;
481 pixel *src = (pixel*)_src;
482 int stride = _stride>>(sizeof(pixel)-1);
483 const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
484 const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
485
486 for(i=0; i<16; i++){
487 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
488 AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
489 }
490 }
491
FUNCC(pred8x8_horizontal)492 static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
493 {
494 int i;
495 pixel *src = (pixel*)_src;
496 stride >>= sizeof(pixel)-1;
497
498 for(i=0; i<8; i++){
499 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
500 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
501 AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
502 }
503 }
504
FUNCC(pred8x16_horizontal)505 static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
506 {
507 int i;
508 pixel *src = (pixel*)_src;
509 stride >>= sizeof(pixel)-1;
510 for(i=0; i<16; i++){
511 const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
512 AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
513 AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
514 }
515 }
516
/* Generate an 8x8 fixed-value DC predictor named pred8x8_<n>_dc that
 * fills the block with constant v. The 127/129 variants exist only at
 * 8-bit depth. */
#define PRED8x8_X(n, v)\
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    const pixel4 a = PIXEL_SPLAT_X4(v);\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    for(i=0; i<8; i++){\
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
    }\
}

PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
#if BIT_DEPTH == 8
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
#endif
535
/* 8x16 mid-grey DC prediction: two stacked 8x8 fills. The 8*stride
 * byte offset reaches row 8 regardless of pixel size because stride
 * here is still the byte stride. */
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_128_dc)(_src,            stride);
    FUNCC(pred8x8_128_dc)(_src+8*stride,   stride);
}
541
/* 8x8 chroma left-DC prediction: the top and bottom halves each get
 * the average of their own four left neighbours (per H.264 chroma DC
 * rules when the top edge is unavailable). */
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc2;               /* sums for rows 0-3 and rows 4-7 */
    pixel4 dc0splat, dc2splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);

    /* Top half filled from the upper-left average... */
    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
    }
    /* ...bottom half from the lower-left average. */
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
    }
}
567
/* 8x16 left-DC prediction: two stacked 8x8 left-DC fills (byte offset
 * 8*stride reaches row 8). */
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(_src,          stride);
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
}
573
FUNCC(pred8x8_top_dc)574 static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
575 {
576 int i;
577 int dc0, dc1;
578 pixel4 dc0splat, dc1splat;
579 pixel *src = (pixel*)_src;
580 stride >>= sizeof(pixel)-1;
581
582 dc0=dc1=0;
583 for(i=0;i<4; i++){
584 dc0+= src[i-stride];
585 dc1+= src[4+i-stride];
586 }
587 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
588 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
589
590 for(i=0; i<4; i++){
591 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
592 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
593 }
594 for(i=4; i<8; i++){
595 AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
596 AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
597 }
598 }
599
/* 8x16 top-DC prediction: left/right halves each get the average of
 * their own four top neighbours, applied to all 16 rows. */
static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1;               /* sums for columns 0-3 and 4-7 */
    pixel4 dc0splat, dc1splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=0;
    for(i=0;i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);

    for(i=0; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
}
621
/* 8x8 chroma DC prediction with all edges available: each 4x4 quadrant
 * gets its own DC per H.264 8.3.4 — top-left from top+left neighbours,
 * top-right from top only, bottom-left from left only, bottom-right
 * from the average of those two edge sums. */
static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1, dc2;          /* dc0: top-left edges; dc1: top-right top edge; dc2: bottom-left left edge */
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);   /* bottom-right quadrant */

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    }
}
650
/* 8x16 chroma DC prediction (4:2:2): eight 4x4 quadrant DCs. The left
 * column quadrants use their own left-edge sums (dc0 also folds in the
 * top edge); every right column quadrant reuses the top-right top-edge
 * sum dc1 combined with the corresponding left-edge sum. */
static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1, dc2, dc3, dc4;   /* dc0: top-left top+left; dc1: top-right top; dc2/3/4: left edge rows 4-7/8-11/12-15 */
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=dc2=dc3=dc4=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
        dc3+= src[-1+(i+8)*stride];
        dc4+= src[-1+(i+12)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    }
    for(i=8; i<12; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
    }
    for(i=12; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
    }
}
693
// The following "mad cow" DC functions must not be optimized/merged:
// they deliberately re-predict a sub-block after the full-block pass.
/* 8x8 DC, "l0t" availability: top-DC over the whole block, then the
 * top-left 4x4 re-predicted with full DC (top+left neighbours). */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
700
/* 8x16 variant of the "l0t" DC case: top-DC over the whole block,
 * then full DC on the top-left 4x4. */
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
706
/* 8x8 DC, "0lt" availability: full quadrant DC over the whole block,
 * then the top-left 4x4 re-predicted with top-only DC. */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
712
/* 8x16 variant of the "0lt" DC case: full quadrant DC, then top-only
 * DC on the top-left 4x4. */
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
718
/* 8x8 DC, "l00" availability: left-DC over the whole block, then the
 * two bottom 4x4s overwritten with mid-grey (no usable neighbours).
 * Offsets are in bytes, hence the 4*sizeof(pixel) column step. */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
725
/* 8x16 variant of the "l00" DC case: left-DC over the whole block,
 * then mid-grey on the two 4x4s in rows 4-7. */
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
732
/* 8x8 DC, "0l0" availability: left-DC over the whole block, then the
 * two top 4x4s overwritten with mid-grey. */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
739
/* 8x16 variant of the "0l0" DC case: left-DC over the whole block,
 * then mid-grey on the two top 4x4s. */
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
746
/* 8x8 chroma plane prediction (H.264 8.3.4.4): least-squares-style
 * gradients H and V from the top/left edges, then a clipped linear
 * ramp. Bit-exactness-critical arithmetic. */
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
{
    int j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +3-stride;   /* centre of the top edge */
    const pixel *       src1 = src +4*stride-1; /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;   // == src+2*stride-1;
    int H = src0[1] - src0[-1];                 /* horizontal gradient */
    int V = src1[0] - src2[ 0];                 /* vertical gradient */
    for(k=2; k<=4; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    H = ( 17*H+16 ) >> 5;
    V = ( 17*V+16 ) >> 5;

    /* a = 32*(plane offset); per-pixel value is (a + x*H + y*V)>>5. */
    a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
    for(j=8; j>0; --j) {
        int b = a;
        a += V;
        src[0] = CLIP((b    ) >> 5);
        src[1] = CLIP((b+  H) >> 5);
        src[2] = CLIP((b+2*H) >> 5);
        src[3] = CLIP((b+3*H) >> 5);
        src[4] = CLIP((b+4*H) >> 5);
        src[5] = CLIP((b+5*H) >> 5);
        src[6] = CLIP((b+6*H) >> 5);
        src[7] = CLIP((b+7*H) >> 5);
        src += stride;
    }
}
782
/* 8x16 chroma plane prediction (4:2:2): horizontal gradient over 4
 * top-edge pairs, vertical gradient over 8 left-edge pairs, with
 * 4:2:2-specific scaling. Bit-exactness-critical arithmetic. */
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
{
    int j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +3-stride;   /* centre of the top edge */
    const pixel *       src1 = src +8*stride-1; /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;   // == src+6*stride-1;
    int H = src0[1] - src0[-1];                 /* horizontal gradient */
    int V = src1[0] - src2[ 0];                 /* vertical gradient */

    for (k = 2; k <= 4; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    /* The vertical edge is twice as tall: keep accumulating V alone. */
    for (; k <= 8; ++k) {
        src1 += stride; src2 -= stride;
        V += k*(src1[0] - src2[0]);
    }

    H = (17*H+16) >> 5;
    V = (5*V+32) >> 6;

    a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
    for(j=16; j>0; --j) {
        int b = a;
        a += V;
        src[0] = CLIP((b    ) >> 5);
        src[1] = CLIP((b+  H) >> 5);
        src[2] = CLIP((b+2*H) >> 5);
        src[3] = CLIP((b+3*H) >> 5);
        src[4] = CLIP((b+4*H) >> 5);
        src[5] = CLIP((b+5*H) >> 5);
        src[6] = CLIP((b+6*H) >> 5);
        src[7] = CLIP((b+7*H) >> 5);
        src += stride;
    }
}
824
/* Pixel accessor relative to the 8x8 block origin. */
#define SRC(x,y) src[(x)+(y)*stride]
/* One (1,2,1)-filtered left-edge sample. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Load filtered left-edge samples l0..l7; the first/last samples fall
 * back to edge replication when the corner is unavailable. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* One (1,2,1)-filtered top-edge sample. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Load filtered top-edge samples t0..t7 with corner fallbacks. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* One filtered top-right sample (assignment form, declared below). */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Load filtered top-right samples t8..t15, or replicate the last top
 * sample when the top-right block is unavailable. */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Filtered top-left corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fill the 8x8 block with splatted DC value v; advances src. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
        src += stride; \
    }
862
/* 8x8 luma DC prediction with no neighbours: fill with mid-grey. */
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
}
/* 8x8 luma left-DC prediction: average of the 8 filtered left-edge
 * samples. */
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_LEFT;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma top-DC prediction: average of the 8 filtered top-edge
 * samples. */
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma DC prediction: average of all 16 filtered top and left
 * edge samples. */
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                               int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
                                      +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma horizontal prediction: fill each row with its filtered
 * left-edge sample. */
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    pixel4 a;

    PREDICT_8x8_LOAD_LEFT;
#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
               AV_WN4PA(src+y*stride, a); \
               AV_WN4PA(src+y*stride+4, a);
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
/* 8x8 luma vertical prediction: write the filtered top edge into row 0,
 * then replicate row 0 into the remaining 7 rows. */
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
{
    int y;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    pixel4 a, b;

    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    /* Re-read row 0 as two aligned pixel4 chunks and copy downwards. */
    a = AV_RN4PA(((pixel4*)src)+0);
    b = AV_RN4PA(((pixel4*)src)+1);
    for( y = 1; y < 8; y++ ) {
        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
    }
}
/* 8x8 luma diagonal down-left prediction: each down-left diagonal is a
 * (1,2,1) filtered value of the filtered top/top-right samples t0..t15. */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                      int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;   /* edge: (1,3) taps */
}
/* 8x8 luma diagonal down-right prediction: each down-right diagonal is
 * a (1,2,1) filtered value of the filtered left/corner/top samples. */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;   /* main diagonal */
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
/* 8x8 luma vertical-right prediction: even rows use 2-tap averages of
 * the filtered top edge, odd rows use 3-tap (1,2,1) values; the first
 * column of the lower rows comes from the filtered left edge. */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * 8x8 luma horizontal-down intra prediction (H.264 Intra_8x8 mode 6).
 *
 * Mirror of vertical-right: prediction runs at a half-pixel slant from
 * the left column, alternating (1,1)/2 averages and (1,2,1)/4 filters
 * of the left samples; the top-right corner of the block is filled from
 * filtered top-row samples.
 * has_topleft/has_topright are not referenced directly in this body;
 * presumably they are consumed inside the load macros — confirm there.
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    /* top-right corner comes from the top row */
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma vertical-left intra prediction (H.264 Intra_8x8 mode 7).
 *
 * Uses only the top and top-right neighbors (t0..t12 from
 * PREDICT_8x8_LOAD_TOP / PREDICT_8x8_LOAD_TOPRIGHT): even rows are
 * (1,1)/2 averages of adjacent top samples, odd rows are (1,2,1)/4
 * filtered values, each row shifted half a sample further left.
 * has_topleft/has_topright are not referenced directly in this body;
 * presumably they are consumed inside the load macros — confirm there.
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 luma horizontal-up intra prediction (H.264 Intra_8x8 mode 8).
 *
 * Uses only the left-column samples (l0..l7 from PREDICT_8x8_LOAD_LEFT):
 * alternating (1,1)/2 averages and (1,2,1)/4 filtered values marching up
 * the column; once the filter runs past l7 the remaining lower-right
 * triangle is filled with l7 replicated.
 * has_topleft/has_topright are not referenced directly in this body;
 * presumably they are consumed inside the load macro — confirm there.
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* no samples below l7: replicate the last left sample */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
1105
FUNCC(pred8x8l_vertical_filter_add)1106 static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1107 int has_topright, ptrdiff_t _stride)
1108 {
1109 int i;
1110 pixel *src = (pixel*)_src;
1111 const dctcoef *block = (const dctcoef*)_block;
1112 pixel pix[8];
1113 int stride = _stride>>(sizeof(pixel)-1);
1114 PREDICT_8x8_LOAD_TOP;
1115
1116 pix[0] = t0;
1117 pix[1] = t1;
1118 pix[2] = t2;
1119 pix[3] = t3;
1120 pix[4] = t4;
1121 pix[5] = t5;
1122 pix[6] = t6;
1123 pix[7] = t7;
1124
1125 for(i=0; i<8; i++){
1126 pixel v = pix[i];
1127 src[0*stride]= v += block[0];
1128 src[1*stride]= v += block[8];
1129 src[2*stride]= v += block[16];
1130 src[3*stride]= v += block[24];
1131 src[4*stride]= v += block[32];
1132 src[5*stride]= v += block[40];
1133 src[6*stride]= v += block[48];
1134 src[7*stride]= v + block[56];
1135 src++;
1136 block++;
1137 }
1138
1139 memset(_block, 0, sizeof(dctcoef) * 64);
1140 }
1141
FUNCC(pred8x8l_horizontal_filter_add)1142 static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1143 int has_topright, ptrdiff_t _stride)
1144 {
1145 int i;
1146 pixel *src = (pixel*)_src;
1147 const dctcoef *block = (const dctcoef*)_block;
1148 pixel pix[8];
1149 int stride = _stride>>(sizeof(pixel)-1);
1150 PREDICT_8x8_LOAD_LEFT;
1151
1152 pix[0] = l0;
1153 pix[1] = l1;
1154 pix[2] = l2;
1155 pix[3] = l3;
1156 pix[4] = l4;
1157 pix[5] = l5;
1158 pix[6] = l6;
1159 pix[7] = l7;
1160
1161 for(i=0; i<8; i++){
1162 pixel v = pix[i];
1163 src[0]= v += block[0];
1164 src[1]= v += block[1];
1165 src[2]= v += block[2];
1166 src[3]= v += block[3];
1167 src[4]= v += block[4];
1168 src[5]= v += block[5];
1169 src[6]= v += block[6];
1170 src[7]= v + block[7];
1171 src+= stride;
1172 block+= 8;
1173 }
1174
1175 memset(_block, 0, sizeof(dctcoef) * 64);
1176 }
1177
1178 #undef PREDICT_8x8_LOAD_LEFT
1179 #undef PREDICT_8x8_LOAD_TOP
1180 #undef PREDICT_8x8_LOAD_TOPLEFT
1181 #undef PREDICT_8x8_LOAD_TOPRIGHT
1182 #undef PREDICT_8x8_DC
1183 #undef PTR
1184 #undef PT
1185 #undef PL
1186 #undef SRC
1187
FUNCC(pred4x4_vertical_add)1188 static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
1189 ptrdiff_t stride)
1190 {
1191 int i;
1192 pixel *pix = (pixel*)_pix;
1193 const dctcoef *block = (const dctcoef*)_block;
1194 stride >>= sizeof(pixel)-1;
1195 pix -= stride;
1196 for(i=0; i<4; i++){
1197 pixel v = pix[0];
1198 pix[1*stride]= v += block[0];
1199 pix[2*stride]= v += block[4];
1200 pix[3*stride]= v += block[8];
1201 pix[4*stride]= v + block[12];
1202 pix++;
1203 block++;
1204 }
1205
1206 memset(_block, 0, sizeof(dctcoef) * 16);
1207 }
1208
FUNCC(pred4x4_horizontal_add)1209 static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
1210 ptrdiff_t stride)
1211 {
1212 int i;
1213 pixel *pix = (pixel*)_pix;
1214 const dctcoef *block = (const dctcoef*)_block;
1215 stride >>= sizeof(pixel)-1;
1216 for(i=0; i<4; i++){
1217 pixel v = pix[-1];
1218 pix[0]= v += block[0];
1219 pix[1]= v += block[1];
1220 pix[2]= v += block[2];
1221 pix[3]= v + block[3];
1222 pix+= stride;
1223 block+= 4;
1224 }
1225
1226 memset(_block, 0, sizeof(dctcoef) * 16);
1227 }
1228
FUNCC(pred8x8l_vertical_add)1229 static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
1230 ptrdiff_t stride)
1231 {
1232 int i;
1233 pixel *pix = (pixel*)_pix;
1234 const dctcoef *block = (const dctcoef*)_block;
1235 stride >>= sizeof(pixel)-1;
1236 pix -= stride;
1237 for(i=0; i<8; i++){
1238 pixel v = pix[0];
1239 pix[1*stride]= v += block[0];
1240 pix[2*stride]= v += block[8];
1241 pix[3*stride]= v += block[16];
1242 pix[4*stride]= v += block[24];
1243 pix[5*stride]= v += block[32];
1244 pix[6*stride]= v += block[40];
1245 pix[7*stride]= v += block[48];
1246 pix[8*stride]= v + block[56];
1247 pix++;
1248 block++;
1249 }
1250
1251 memset(_block, 0, sizeof(dctcoef) * 64);
1252 }
1253
FUNCC(pred8x8l_horizontal_add)1254 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
1255 ptrdiff_t stride)
1256 {
1257 int i;
1258 pixel *pix = (pixel*)_pix;
1259 const dctcoef *block = (const dctcoef*)_block;
1260 stride >>= sizeof(pixel)-1;
1261 for(i=0; i<8; i++){
1262 pixel v = pix[-1];
1263 pix[0]= v += block[0];
1264 pix[1]= v += block[1];
1265 pix[2]= v += block[2];
1266 pix[3]= v += block[3];
1267 pix[4]= v += block[4];
1268 pix[5]= v += block[5];
1269 pix[6]= v += block[6];
1270 pix[7]= v + block[7];
1271 pix+= stride;
1272 block+= 8;
1273 }
1274
1275 memset(_block, 0, sizeof(dctcoef) * 64);
1276 }
1277
FUNCC(pred16x16_vertical_add)1278 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
1279 int16_t *block,
1280 ptrdiff_t stride)
1281 {
1282 int i;
1283 for(i=0; i<16; i++)
1284 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1285 }
1286
FUNCC(pred16x16_horizontal_add)1287 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
1288 const int *block_offset,
1289 int16_t *block,
1290 ptrdiff_t stride)
1291 {
1292 int i;
1293 for(i=0; i<16; i++)
1294 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1295 }
1296
FUNCC(pred8x8_vertical_add)1297 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
1298 int16_t *block, ptrdiff_t stride)
1299 {
1300 int i;
1301 for(i=0; i<4; i++)
1302 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1303 }
1304
FUNCC(pred8x16_vertical_add)1305 static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
1306 int16_t *block, ptrdiff_t stride)
1307 {
1308 int i;
1309 for(i=0; i<4; i++)
1310 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1311 for(i=4; i<8; i++)
1312 FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1313 }
1314
FUNCC(pred8x8_horizontal_add)1315 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
1316 int16_t *block,
1317 ptrdiff_t stride)
1318 {
1319 int i;
1320 for(i=0; i<4; i++)
1321 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1322 }
1323
FUNCC(pred8x16_horizontal_add)1324 static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
1325 const int *block_offset,
1326 int16_t *block, ptrdiff_t stride)
1327 {
1328 int i;
1329 for(i=0; i<4; i++)
1330 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1331 for(i=4; i<8; i++)
1332 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1333 }
1334