1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 #include "avcdec_lib.h"
19 #include "oscl_mem.h"
20
21
22 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
23 x = 0xFF & (~(x>>31));}
24
25 /* (blkwidth << 2) + (dy << 1) + dx */
26 static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
27 {
28 &ChromaFullMC_SIMD,
29 &ChromaHorizontalMC_SIMD,
30 &ChromaVerticalMC_SIMD,
31 &ChromaDiagonalMC_SIMD,
32 &ChromaFullMC_SIMD,
33 &ChromaHorizontalMC2_SIMD,
34 &ChromaVerticalMC2_SIMD,
35 &ChromaDiagonalMC2_SIMD
36 };
37 /* Perform motion prediction and compensation with residue if exist. */
InterMBPrediction(AVCCommonObj * video)38 void InterMBPrediction(AVCCommonObj *video)
39 {
40 AVCMacroblock *currMB = video->currMB;
41 AVCPictureData *currPic = video->currPic;
42 int mbPartIdx, subMbPartIdx;
43 int ref_idx;
44 int offset_MbPart_indx = 0;
45 int16 *mv;
46 uint32 x_pos, y_pos;
47 uint8 *curL, *curCb, *curCr;
48 uint8 *ref_l, *ref_Cb, *ref_Cr;
49 uint8 *predBlock, *predCb, *predCr;
50 int block_x, block_y, offset_x, offset_y, offsetP, offset;
51 int x_position = (video->mb_x << 4);
52 int y_position = (video->mb_y << 4);
53 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
54 int picWidth = currPic->pitch;
55 int picHeight = currPic->height;
56 int16 *dataBlock;
57 uint32 cbp4x4;
58 uint32 tmp_word;
59
60 tmp_word = y_position * picWidth;
61 curL = currPic->Sl + tmp_word + x_position;
62 offset = (tmp_word >> 2) + (x_position >> 1);
63 curCb = currPic->Scb + offset;
64 curCr = currPic->Scr + offset;
65
66 #ifdef USE_PRED_BLOCK
67 predBlock = video->pred + 84;
68 predCb = video->pred + 452;
69 predCr = video->pred + 596;
70 #else
71 predBlock = curL;
72 predCb = curCb;
73 predCr = curCr;
74 #endif
75
76 GetMotionVectorPredictor(video, false);
77
78 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
79 {
80 MbHeight = currMB->SubMbPartHeight[mbPartIdx];
81 MbWidth = currMB->SubMbPartWidth[mbPartIdx];
82 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
83 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
84 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
85 offset_indx = 0;
86
87 ref_l = video->RefPicList0[ref_idx]->Sl;
88 ref_Cb = video->RefPicList0[ref_idx]->Scb;
89 ref_Cr = video->RefPicList0[ref_idx]->Scr;
90
91 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
92 {
93 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this
94 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
95 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
96 offset_x = x_position + (block_x << 2);
97 offset_y = y_position + (block_y << 2);
98 x_pos = (offset_x << 2) + *mv++; /*quarter pel */
99 y_pos = (offset_y << 2) + *mv; /*quarter pel */
100
101 //offset = offset_y * currPic->width;
102 //offsetC = (offset >> 2) + (offset_x >> 1);
103 #ifdef USE_PRED_BLOCK
104 offsetP = (block_y * 80) + (block_x << 2);
105 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
106 /*comp_Sl + offset + offset_x,*/
107 predBlock + offsetP, 20, MbWidth, MbHeight);
108 #else
109 offsetP = (block_y << 2) * picWidth + (block_x << 2);
110 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
111 /*comp_Sl + offset + offset_x,*/
112 predBlock + offsetP, picWidth, MbWidth, MbHeight);
113 #endif
114
115 #ifdef USE_PRED_BLOCK
116 offsetP = (block_y * 24) + (block_x << 1);
117 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
118 /*comp_Scb + offsetC,*/
119 predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
120 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
121 /*comp_Scr + offsetC,*/
122 predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
123 #else
124 offsetP = (block_y * picWidth) + (block_x << 1);
125 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
126 /*comp_Scb + offsetC,*/
127 predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
128 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
129 /*comp_Scr + offsetC,*/
130 predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
131 #endif
132
133 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
134 }
135 offset_MbPart_indx = currMB->MbPartWidth >> 4;
136 }
137
138 /* used in decoder, used to be if(!encFlag) */
139
140 /* transform in raster scan order */
141 dataBlock = video->block;
142 cbp4x4 = video->cbp4x4;
143 /* luma */
144 for (block_y = 4; block_y > 0; block_y--)
145 {
146 for (block_x = 4; block_x > 0; block_x--)
147 {
148 #ifdef USE_PRED_BLOCK
149 if (cbp4x4&1)
150 {
151 itrans(dataBlock, predBlock, predBlock, 20);
152 }
153 #else
154 if (cbp4x4&1)
155 {
156 itrans(dataBlock, curL, curL, picWidth);
157 }
158 #endif
159 cbp4x4 >>= 1;
160 dataBlock += 4;
161 #ifdef USE_PRED_BLOCK
162 predBlock += 4;
163 #else
164 curL += 4;
165 #endif
166 }
167 dataBlock += 48;
168 #ifdef USE_PRED_BLOCK
169 predBlock += 64;
170 #else
171 curL += ((picWidth << 2) - 16);
172 #endif
173 }
174
175 /* chroma */
176 picWidth = (picWidth >> 1);
177 for (block_y = 2; block_y > 0; block_y--)
178 {
179 for (block_x = 2; block_x > 0; block_x--)
180 {
181 #ifdef USE_PRED_BLOCK
182 if (cbp4x4&1)
183 {
184 ictrans(dataBlock, predCb, predCb, 12);
185 }
186 #else
187 if (cbp4x4&1)
188 {
189 ictrans(dataBlock, curCb, curCb, picWidth);
190 }
191 #endif
192 cbp4x4 >>= 1;
193 dataBlock += 4;
194 #ifdef USE_PRED_BLOCK
195 predCb += 4;
196 #else
197 curCb += 4;
198 #endif
199 }
200 for (block_x = 2; block_x > 0; block_x--)
201 {
202 #ifdef USE_PRED_BLOCK
203 if (cbp4x4&1)
204 {
205 ictrans(dataBlock, predCr, predCr, 12);
206 }
207 #else
208 if (cbp4x4&1)
209 {
210 ictrans(dataBlock, curCr, curCr, picWidth);
211 }
212 #endif
213 cbp4x4 >>= 1;
214 dataBlock += 4;
215 #ifdef USE_PRED_BLOCK
216 predCr += 4;
217 #else
218 curCr += 4;
219 #endif
220 }
221 dataBlock += 48;
222 #ifdef USE_PRED_BLOCK
223 predCb += 40;
224 predCr += 40;
225 #else
226 curCb += ((picWidth << 2) - 8);
227 curCr += ((picWidth << 2) - 8);
228 #endif
229 }
230
231 #ifdef MB_BASED_DEBLOCK
232 SaveNeighborForIntraPred(video, offset);
233 #endif
234
235 return ;
236 }
237
238
239 /* preform the actual motion comp here */
LumaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)240 void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
241 int x_pos, int y_pos,
242 uint8 *pred, int pred_pitch,
243 int blkwidth, int blkheight)
244 {
245 int dx, dy;
246 uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
247 int temp2[21][21]; /* for intermediate results */
248 uint8 *ref2;
249
250 dx = x_pos & 3;
251 dy = y_pos & 3;
252 x_pos = x_pos >> 2; /* round it to full-pel resolution */
253 y_pos = y_pos >> 2;
254
255 /* perform actual motion compensation */
256 if (dx == 0 && dy == 0)
257 { /* fullpel position *//* G */
258 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
259 {
260 ref += y_pos * picwidth + x_pos;
261 FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
262 }
263 else
264 {
265 CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
266 FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
267 }
268
269 } /* other positions */
270 else if (dy == 0)
271 { /* no vertical interpolation *//* a,b,c*/
272
273 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
274 {
275 ref += y_pos * picwidth + x_pos;
276
277 HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
278 }
279 else /* need padding */
280 {
281 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
282
283 HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
284 }
285 }
286 else if (dx == 0)
287 { /*no horizontal interpolation *//* d,h,n */
288
289 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
290 {
291 ref += y_pos * picwidth + x_pos;
292
293 VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
294 }
295 else /* need padding */
296 {
297 CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
298
299 VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
300 }
301 }
302 else if (dy == 2)
303 { /* horizontal cross *//* i, j, k */
304
305 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
306 {
307 ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
308
309 VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
310
311 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
312 }
313 else /* need padding */
314 {
315 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
316
317 VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
318
319 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
320 }
321 }
322 else if (dx == 2)
323 { /* vertical cross */ /* f,q */
324
325 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
326 {
327 ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
328
329 HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
330 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
331 }
332 else /* need padding */
333 {
334 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
335 HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
336 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
337 }
338 }
339 else
340 { /* diagonal *//* e,g,p,r */
341
342 if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
343 y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
344 {
345 ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
346
347 ref += (y_pos * picwidth) + x_pos + (dx / 2);
348
349 DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
350 }
351 else /* need padding */
352 {
353 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
354
355 ref2 = &temp[2 + (dy/2)][2];
356
357 ref = &temp[2][2 + (dx/2)];
358
359 DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
360 }
361 }
362
363 return ;
364 }
365
CreateAlign(uint8 * ref,int picwidth,int y_pos,uint8 * out,int blkwidth,int blkheight)366 void CreateAlign(uint8 *ref, int picwidth, int y_pos,
367 uint8 *out, int blkwidth, int blkheight)
368 {
369 int i, j;
370 int offset, out_offset;
371 uint32 prev_pix, result, pix1, pix2, pix4;
372
373 out_offset = 24 - blkwidth;
374
375 //switch(x_pos&0x3){
376 switch (((uint32)ref)&0x3)
377 {
378 case 1:
379 ref += y_pos * picwidth;
380 offset = picwidth - blkwidth - 3;
381 for (j = 0; j < blkheight; j++)
382 {
383 pix1 = *ref++;
384 pix2 = *((uint16*)ref);
385 ref += 2;
386 result = (pix2 << 8) | pix1;
387
388 for (i = 3; i < blkwidth; i += 4)
389 {
390 pix4 = *((uint32*)ref);
391 ref += 4;
392 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
393 result |= prev_pix;
394 *((uint32*)out) = result; /* write 4 bytes */
395 out += 4;
396 result = pix4 >> 8; /* for the next loop */
397 }
398 ref += offset;
399 out += out_offset;
400 }
401 break;
402 case 2:
403 ref += y_pos * picwidth;
404 offset = picwidth - blkwidth - 2;
405 for (j = 0; j < blkheight; j++)
406 {
407 result = *((uint16*)ref);
408 ref += 2;
409 for (i = 2; i < blkwidth; i += 4)
410 {
411 pix4 = *((uint32*)ref);
412 ref += 4;
413 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
414 result |= prev_pix;
415 *((uint32*)out) = result; /* write 4 bytes */
416 out += 4;
417 result = pix4 >> 16; /* for the next loop */
418 }
419 ref += offset;
420 out += out_offset;
421 }
422 break;
423 case 3:
424 ref += y_pos * picwidth;
425 offset = picwidth - blkwidth - 1;
426 for (j = 0; j < blkheight; j++)
427 {
428 result = *ref++;
429 for (i = 1; i < blkwidth; i += 4)
430 {
431 pix4 = *((uint32*)ref);
432 ref += 4;
433 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
434 result |= prev_pix;
435 *((uint32*)out) = result; /* write 4 bytes */
436 out += 4;
437 result = pix4 >> 24; /* for the next loop */
438 }
439 ref += offset;
440 out += out_offset;
441 }
442 break;
443 }
444 }
445
CreatePad(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * out,int blkwidth,int blkheight)446 void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
447 uint8 *out, int blkwidth, int blkheight)
448 {
449 int x_inc0, x_mid;
450 int y_inc, y_inc0, y_inc1, y_mid;
451 int i, j;
452 int offset;
453
454 if (x_pos < 0)
455 {
456 x_inc0 = 0; /* increment for the first part */
457 x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth); /* stopping point */
458 x_pos = 0;
459 }
460 else if (x_pos + blkwidth > picwidth)
461 {
462 x_inc0 = 1; /* increasing */
463 x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0); /* clip negative to zero, encode fool proof! */
464 }
465 else /* normal case */
466 {
467 x_inc0 = 1;
468 x_mid = blkwidth; /* just one run */
469 }
470
471
472 /* boundary for y_pos, taking the result from x_pos into account */
473 if (y_pos < 0)
474 {
475 y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
476 y_inc1 = picwidth + y_inc0;
477 y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
478 y_pos = 0;
479 }
480 else if (y_pos + blkheight > picheight)
481 {
482 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
483 y_inc0 = picwidth + y_inc1; /* increasing */
484 y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
485 }
486 else /* normal case */
487 {
488 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
489 y_inc0 = picwidth + y_inc1;
490 y_mid = blkheight;
491 }
492
493 /* clip y_pos and x_pos */
494 if (y_pos > picheight - 1) y_pos = picheight - 1;
495 if (x_pos > picwidth - 1) x_pos = picwidth - 1;
496
497 ref += y_pos * picwidth + x_pos;
498
499 y_inc = y_inc0; /* start with top half */
500
501 offset = 24 - blkwidth; /* to use in offset out */
502 blkwidth -= x_mid; /* to use in the loop limit */
503
504 if (x_inc0 == 0)
505 {
506 for (j = 0; j < blkheight; j++)
507 {
508 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
509 {
510 y_inc = y_inc1; /* switch to lower half */
511 }
512 for (i = x_mid; i > 0; i--) /* first or third quarter */
513 {
514 *out++ = *ref;
515 }
516 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
517 {
518 *out++ = *ref++;
519 }
520 out += offset;
521 ref += y_inc;
522 }
523 }
524 else
525 {
526 for (j = 0; j < blkheight; j++)
527 {
528 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
529 {
530 y_inc = y_inc1; /* switch to lower half */
531 }
532 for (i = x_mid; i > 0; i--) /* first or third quarter */
533 {
534 *out++ = *ref++;
535 }
536 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
537 {
538 *out++ = *ref;
539 }
540 out += offset;
541 ref += y_inc;
542 }
543 }
544
545 return ;
546 }
547
HorzInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)548 void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
549 int blkwidth, int blkheight, int dx)
550 {
551 uint8 *p_ref;
552 uint32 *p_cur;
553 uint32 tmp, pkres;
554 int result, curr_offset, ref_offset;
555 int j;
556 int32 r0, r1, r2, r3, r4, r5;
557 int32 r13, r6;
558
559 p_cur = (uint32*)out; /* assume it's word aligned */
560 curr_offset = (outpitch - blkwidth) >> 2;
561 p_ref = in;
562 ref_offset = inpitch - blkwidth;
563
564 if (dx&1)
565 {
566 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
567 p_ref -= 2;
568 r13 = 0;
569 for (j = blkheight; j > 0; j--)
570 {
571 tmp = (uint32)(p_ref + blkwidth);
572 r0 = p_ref[0];
573 r1 = p_ref[2];
574 r0 |= (r1 << 16); /* 0,c,0,a */
575 r1 = p_ref[1];
576 r2 = p_ref[3];
577 r1 |= (r2 << 16); /* 0,d,0,b */
578 while ((uint32)p_ref < tmp)
579 {
580 r2 = *(p_ref += 4); /* move pointer to e */
581 r3 = p_ref[2];
582 r2 |= (r3 << 16); /* 0,g,0,e */
583 r3 = p_ref[1];
584 r4 = p_ref[3];
585 r3 |= (r4 << 16); /* 0,h,0,f */
586
587 r4 = r0 + r3; /* c+h, a+f */
588 r5 = r0 + r1; /* c+d, a+b */
589 r6 = r2 + r3; /* g+h, e+f */
590 r5 >>= 16;
591 r5 |= (r6 << 16); /* e+f, c+d */
592 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
593 r4 += 0x100010; /* +16, +16 */
594 r5 = r1 + r2; /* d+g, b+e */
595 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
596 r4 >>= 5;
597 r13 |= r4; /* check clipping */
598
599 r5 = p_ref[dx+2];
600 r6 = p_ref[dx+4];
601 r5 |= (r6 << 16);
602 r4 += r5;
603 r4 += 0x10001;
604 r4 = (r4 >> 1) & 0xFF00FF;
605
606 r5 = p_ref[4]; /* i */
607 r6 = (r5 << 16);
608 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
609 r5 += r1; /* d+i, b+g */ /* r5 not free */
610 r1 >>= 16;
611 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
612 r1 += r2; /* f+g, d+e */
613 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
614 r0 >>= 16;
615 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
616 r0 += r3; /* e+h, c+f */
617 r5 += 0x100010; /* 16,16 */
618 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
619 r5 >>= 5;
620 r13 |= r5; /* check clipping */
621
622 r0 = p_ref[dx+3];
623 r1 = p_ref[dx+5];
624 r0 |= (r1 << 16);
625 r5 += r0;
626 r5 += 0x10001;
627 r5 = (r5 >> 1) & 0xFF00FF;
628
629 r4 |= (r5 << 8); /* pack them together */
630 *p_cur++ = r4;
631 r1 = r3;
632 r0 = r2;
633 }
634 p_cur += curr_offset; /* move to the next line */
635 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
636
637 if (r13&0xFF000700) /* need clipping */
638 {
639 /* move back to the beginning of the line */
640 p_ref -= (ref_offset + blkwidth); /* input */
641 p_cur -= (outpitch >> 2);
642
643 tmp = (uint32)(p_ref + blkwidth);
644 for (; (uint32)p_ref < tmp;)
645 {
646
647 r0 = *p_ref++;
648 r1 = *p_ref++;
649 r2 = *p_ref++;
650 r3 = *p_ref++;
651 r4 = *p_ref++;
652 /* first pixel */
653 r5 = *p_ref++;
654 result = (r0 + r5);
655 r0 = (r1 + r4);
656 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
657 r0 = (r2 + r3);
658 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
659 result = (result + 16) >> 5;
660 CLIP_RESULT(result)
661 /* 3/4 pel, no need to clip */
662 result = (result + p_ref[dx] + 1);
663 pkres = (result >> 1) ;
664 /* second pixel */
665 r0 = *p_ref++;
666 result = (r1 + r0);
667 r1 = (r2 + r5);
668 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
669 r1 = (r3 + r4);
670 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
671 result = (result + 16) >> 5;
672 CLIP_RESULT(result)
673 /* 3/4 pel, no need to clip */
674 result = (result + p_ref[dx] + 1);
675 result = (result >> 1);
676 pkres |= (result << 8);
677 /* third pixel */
678 r1 = *p_ref++;
679 result = (r2 + r1);
680 r2 = (r3 + r0);
681 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
682 r2 = (r4 + r5);
683 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
684 result = (result + 16) >> 5;
685 CLIP_RESULT(result)
686 /* 3/4 pel, no need to clip */
687 result = (result + p_ref[dx] + 1);
688 result = (result >> 1);
689 pkres |= (result << 16);
690 /* fourth pixel */
691 r2 = *p_ref++;
692 result = (r3 + r2);
693 r3 = (r4 + r1);
694 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
695 r3 = (r5 + r0);
696 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
697 result = (result + 16) >> 5;
698 CLIP_RESULT(result)
699 /* 3/4 pel, no need to clip */
700 result = (result + p_ref[dx] + 1);
701 result = (result >> 1);
702 pkres |= (result << 24);
703 *p_cur++ = pkres; /* write 4 pixels */
704 p_ref -= 5; /* offset back to the middle of filter */
705 }
706 p_cur += curr_offset; /* move to the next line */
707 p_ref += ref_offset; /* move to the next line */
708 }
709 }
710 }
711 else
712 {
713 p_ref -= 2;
714 r13 = 0;
715 for (j = blkheight; j > 0; j--)
716 {
717 tmp = (uint32)(p_ref + blkwidth);
718 r0 = p_ref[0];
719 r1 = p_ref[2];
720 r0 |= (r1 << 16); /* 0,c,0,a */
721 r1 = p_ref[1];
722 r2 = p_ref[3];
723 r1 |= (r2 << 16); /* 0,d,0,b */
724 while ((uint32)p_ref < tmp)
725 {
726 r2 = *(p_ref += 4); /* move pointer to e */
727 r3 = p_ref[2];
728 r2 |= (r3 << 16); /* 0,g,0,e */
729 r3 = p_ref[1];
730 r4 = p_ref[3];
731 r3 |= (r4 << 16); /* 0,h,0,f */
732
733 r4 = r0 + r3; /* c+h, a+f */
734 r5 = r0 + r1; /* c+d, a+b */
735 r6 = r2 + r3; /* g+h, e+f */
736 r5 >>= 16;
737 r5 |= (r6 << 16); /* e+f, c+d */
738 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
739 r4 += 0x100010; /* +16, +16 */
740 r5 = r1 + r2; /* d+g, b+e */
741 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
742 r4 >>= 5;
743 r13 |= r4; /* check clipping */
744 r4 &= 0xFF00FF; /* mask */
745
746 r5 = p_ref[4]; /* i */
747 r6 = (r5 << 16);
748 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
749 r5 += r1; /* d+i, b+g */ /* r5 not free */
750 r1 >>= 16;
751 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
752 r1 += r2; /* f+g, d+e */
753 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
754 r0 >>= 16;
755 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
756 r0 += r3; /* e+h, c+f */
757 r5 += 0x100010; /* 16,16 */
758 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
759 r5 >>= 5;
760 r13 |= r5; /* check clipping */
761 r5 &= 0xFF00FF; /* mask */
762
763 r4 |= (r5 << 8); /* pack them together */
764 *p_cur++ = r4;
765 r1 = r3;
766 r0 = r2;
767 }
768 p_cur += curr_offset; /* move to the next line */
769 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
770
771 if (r13&0xFF000700) /* need clipping */
772 {
773 /* move back to the beginning of the line */
774 p_ref -= (ref_offset + blkwidth); /* input */
775 p_cur -= (outpitch >> 2);
776
777 tmp = (uint32)(p_ref + blkwidth);
778 for (; (uint32)p_ref < tmp;)
779 {
780
781 r0 = *p_ref++;
782 r1 = *p_ref++;
783 r2 = *p_ref++;
784 r3 = *p_ref++;
785 r4 = *p_ref++;
786 /* first pixel */
787 r5 = *p_ref++;
788 result = (r0 + r5);
789 r0 = (r1 + r4);
790 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
791 r0 = (r2 + r3);
792 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
793 result = (result + 16) >> 5;
794 CLIP_RESULT(result)
795 pkres = result;
796 /* second pixel */
797 r0 = *p_ref++;
798 result = (r1 + r0);
799 r1 = (r2 + r5);
800 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
801 r1 = (r3 + r4);
802 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
803 result = (result + 16) >> 5;
804 CLIP_RESULT(result)
805 pkres |= (result << 8);
806 /* third pixel */
807 r1 = *p_ref++;
808 result = (r2 + r1);
809 r2 = (r3 + r0);
810 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
811 r2 = (r4 + r5);
812 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
813 result = (result + 16) >> 5;
814 CLIP_RESULT(result)
815 pkres |= (result << 16);
816 /* fourth pixel */
817 r2 = *p_ref++;
818 result = (r3 + r2);
819 r3 = (r4 + r1);
820 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
821 r3 = (r5 + r0);
822 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
823 result = (result + 16) >> 5;
824 CLIP_RESULT(result)
825 pkres |= (result << 24);
826 *p_cur++ = pkres; /* write 4 pixels */
827 p_ref -= 5;
828 }
829 p_cur += curr_offset; /* move to the next line */
830 p_ref += ref_offset;
831 }
832 }
833 }
834
835 return ;
836 }
837
HorzInterp2MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)838 void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
839 int blkwidth, int blkheight, int dx)
840 {
841 int *p_ref;
842 uint32 *p_cur;
843 uint32 tmp, pkres;
844 int result, result2, curr_offset, ref_offset;
845 int j, r0, r1, r2, r3, r4, r5;
846
847 p_cur = (uint32*)out; /* assume it's word aligned */
848 curr_offset = (outpitch - blkwidth) >> 2;
849 p_ref = in;
850 ref_offset = inpitch - blkwidth;
851
852 if (dx&1)
853 {
854 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
855
856 for (j = blkheight; j > 0 ; j--)
857 {
858 tmp = (uint32)(p_ref + blkwidth);
859 for (; (uint32)p_ref < tmp;)
860 {
861
862 r0 = p_ref[-2];
863 r1 = p_ref[-1];
864 r2 = *p_ref++;
865 r3 = *p_ref++;
866 r4 = *p_ref++;
867 /* first pixel */
868 r5 = *p_ref++;
869 result = (r0 + r5);
870 r0 = (r1 + r4);
871 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
872 r0 = (r2 + r3);
873 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
874 result = (result + 512) >> 10;
875 CLIP_RESULT(result)
876 result2 = ((p_ref[dx] + 16) >> 5);
877 CLIP_RESULT(result2)
878 /* 3/4 pel, no need to clip */
879 result = (result + result2 + 1);
880 pkres = (result >> 1);
881 /* second pixel */
882 r0 = *p_ref++;
883 result = (r1 + r0);
884 r1 = (r2 + r5);
885 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
886 r1 = (r3 + r4);
887 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
888 result = (result + 512) >> 10;
889 CLIP_RESULT(result)
890 result2 = ((p_ref[dx] + 16) >> 5);
891 CLIP_RESULT(result2)
892 /* 3/4 pel, no need to clip */
893 result = (result + result2 + 1);
894 result = (result >> 1);
895 pkres |= (result << 8);
896 /* third pixel */
897 r1 = *p_ref++;
898 result = (r2 + r1);
899 r2 = (r3 + r0);
900 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
901 r2 = (r4 + r5);
902 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
903 result = (result + 512) >> 10;
904 CLIP_RESULT(result)
905 result2 = ((p_ref[dx] + 16) >> 5);
906 CLIP_RESULT(result2)
907 /* 3/4 pel, no need to clip */
908 result = (result + result2 + 1);
909 result = (result >> 1);
910 pkres |= (result << 16);
911 /* fourth pixel */
912 r2 = *p_ref++;
913 result = (r3 + r2);
914 r3 = (r4 + r1);
915 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
916 r3 = (r5 + r0);
917 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
918 result = (result + 512) >> 10;
919 CLIP_RESULT(result)
920 result2 = ((p_ref[dx] + 16) >> 5);
921 CLIP_RESULT(result2)
922 /* 3/4 pel, no need to clip */
923 result = (result + result2 + 1);
924 result = (result >> 1);
925 pkres |= (result << 24);
926 *p_cur++ = pkres; /* write 4 pixels */
927 p_ref -= 3; /* offset back to the middle of filter */
928 }
929 p_cur += curr_offset; /* move to the next line */
930 p_ref += ref_offset; /* move to the next line */
931 }
932 }
933 else
934 {
935 for (j = blkheight; j > 0 ; j--)
936 {
937 tmp = (uint32)(p_ref + blkwidth);
938 for (; (uint32)p_ref < tmp;)
939 {
940
941 r0 = p_ref[-2];
942 r1 = p_ref[-1];
943 r2 = *p_ref++;
944 r3 = *p_ref++;
945 r4 = *p_ref++;
946 /* first pixel */
947 r5 = *p_ref++;
948 result = (r0 + r5);
949 r0 = (r1 + r4);
950 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
951 r0 = (r2 + r3);
952 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
953 result = (result + 512) >> 10;
954 CLIP_RESULT(result)
955 pkres = result;
956 /* second pixel */
957 r0 = *p_ref++;
958 result = (r1 + r0);
959 r1 = (r2 + r5);
960 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
961 r1 = (r3 + r4);
962 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
963 result = (result + 512) >> 10;
964 CLIP_RESULT(result)
965 pkres |= (result << 8);
966 /* third pixel */
967 r1 = *p_ref++;
968 result = (r2 + r1);
969 r2 = (r3 + r0);
970 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
971 r2 = (r4 + r5);
972 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
973 result = (result + 512) >> 10;
974 CLIP_RESULT(result)
975 pkres |= (result << 16);
976 /* fourth pixel */
977 r2 = *p_ref++;
978 result = (r3 + r2);
979 r3 = (r4 + r1);
980 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
981 r3 = (r5 + r0);
982 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
983 result = (result + 512) >> 10;
984 CLIP_RESULT(result)
985 pkres |= (result << 24);
986 *p_cur++ = pkres; /* write 4 pixels */
987 p_ref -= 3; /* offset back to the middle of filter */
988 }
989 p_cur += curr_offset; /* move to the next line */
990 p_ref += ref_offset; /* move to the next line */
991 }
992 }
993
994 return ;
995 }
996
HorzInterp3MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)997 void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
998 int blkwidth, int blkheight)
999 {
1000 uint8 *p_ref;
1001 int *p_cur;
1002 uint32 tmp;
1003 int result, curr_offset, ref_offset;
1004 int j, r0, r1, r2, r3, r4, r5;
1005
1006 p_cur = out;
1007 curr_offset = (outpitch - blkwidth);
1008 p_ref = in;
1009 ref_offset = inpitch - blkwidth;
1010
1011 for (j = blkheight; j > 0 ; j--)
1012 {
1013 tmp = (uint32)(p_ref + blkwidth);
1014 for (; (uint32)p_ref < tmp;)
1015 {
1016
1017 r0 = p_ref[-2];
1018 r1 = p_ref[-1];
1019 r2 = *p_ref++;
1020 r3 = *p_ref++;
1021 r4 = *p_ref++;
1022 /* first pixel */
1023 r5 = *p_ref++;
1024 result = (r0 + r5);
1025 r0 = (r1 + r4);
1026 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1027 r0 = (r2 + r3);
1028 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1029 *p_cur++ = result;
1030 /* second pixel */
1031 r0 = *p_ref++;
1032 result = (r1 + r0);
1033 r1 = (r2 + r5);
1034 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1035 r1 = (r3 + r4);
1036 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1037 *p_cur++ = result;
1038 /* third pixel */
1039 r1 = *p_ref++;
1040 result = (r2 + r1);
1041 r2 = (r3 + r0);
1042 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1043 r2 = (r4 + r5);
1044 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1045 *p_cur++ = result;
1046 /* fourth pixel */
1047 r2 = *p_ref++;
1048 result = (r3 + r2);
1049 r3 = (r4 + r1);
1050 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1051 r3 = (r5 + r0);
1052 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1053 *p_cur++ = result;
1054 p_ref -= 3; /* move back to the middle of the filter */
1055 }
1056 p_cur += curr_offset; /* move to the next line */
1057 p_ref += ref_offset;
1058 }
1059
1060 return ;
1061 }
VertInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1062 void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1063 int blkwidth, int blkheight, int dy)
1064 {
1065 uint8 *p_cur, *p_ref;
1066 uint32 tmp;
1067 int result, curr_offset, ref_offset;
1068 int j, i;
1069 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
1070 uint8 tmp_in[24][24];
1071
1072 /* not word-aligned */
1073 if (((uint32)in)&0x3)
1074 {
1075 CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1076 in = &tmp_in[2][0];
1077 inpitch = 24;
1078 }
1079 p_cur = out;
1080 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1081 ref_offset = blkheight * inpitch; /* for limit */
1082
1083 curr_offset += 3;
1084
1085 if (dy&1)
1086 {
1087 dy = (dy >> 1) ? 0 : -inpitch;
1088
1089 for (j = 0; j < blkwidth; j += 4, in += 4)
1090 {
1091 r13 = 0;
1092 p_ref = in;
1093 p_cur -= outpitch; /* compensate for the first offset */
1094 tmp = (uint32)(p_ref + ref_offset); /* limit */
1095 while ((uint32)p_ref < tmp) /* the loop un-rolled */
1096 {
1097 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1098 p_ref += inpitch;
1099 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1100 r0 &= 0xFF00FF;
1101
1102 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
1103 r7 = (r1 >> 8) & 0xFF00FF;
1104 r1 &= 0xFF00FF;
1105
1106 r0 += r1;
1107 r6 += r7;
1108
1109 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1110 r8 = (r2 >> 8) & 0xFF00FF;
1111 r2 &= 0xFF00FF;
1112
1113 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1114 r7 = (r1 >> 8) & 0xFF00FF;
1115 r1 &= 0xFF00FF;
1116 r1 += r2;
1117
1118 r7 += r8;
1119
1120 r0 += 20 * r1;
1121 r6 += 20 * r7;
1122 r0 += 0x100010;
1123 r6 += 0x100010;
1124
1125 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1126 r8 = (r2 >> 8) & 0xFF00FF;
1127 r2 &= 0xFF00FF;
1128
1129 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1130 r7 = (r1 >> 8) & 0xFF00FF;
1131 r1 &= 0xFF00FF;
1132 r1 += r2;
1133
1134 r7 += r8;
1135
1136 r0 -= 5 * r1;
1137 r6 -= 5 * r7;
1138
1139 r0 >>= 5;
1140 r6 >>= 5;
1141 /* clip */
1142 r13 |= r6;
1143 r13 |= r0;
1144 //CLIPPACK(r6,result)
1145
1146 r1 = *((uint32*)(p_ref + dy));
1147 r2 = (r1 >> 8) & 0xFF00FF;
1148 r1 &= 0xFF00FF;
1149 r0 += r1;
1150 r6 += r2;
1151 r0 += 0x10001;
1152 r6 += 0x10001;
1153 r0 = (r0 >> 1) & 0xFF00FF;
1154 r6 = (r6 >> 1) & 0xFF00FF;
1155
1156 r0 |= (r6 << 8); /* pack it back */
1157 *((uint32*)(p_cur += outpitch)) = r0;
1158 }
1159 p_cur += curr_offset; /* offset to the next pixel */
1160 if (r13 & 0xFF000700) /* this column need clipping */
1161 {
1162 p_cur -= 4;
1163 for (i = 0; i < 4; i++)
1164 {
1165 p_ref = in + i;
1166 p_cur -= outpitch; /* compensate for the first offset */
1167
1168 tmp = (uint32)(p_ref + ref_offset); /* limit */
1169 while ((uint32)p_ref < tmp)
1170 { /* loop un-rolled */
1171 r0 = *(p_ref - (inpitch << 1));
1172 r1 = *(p_ref - inpitch);
1173 r2 = *p_ref;
1174 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1175 r4 = *(p_ref += inpitch);
1176 /* first pixel */
1177 r5 = *(p_ref += inpitch);
1178 result = (r0 + r5);
1179 r0 = (r1 + r4);
1180 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1181 r0 = (r2 + r3);
1182 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1183 result = (result + 16) >> 5;
1184 CLIP_RESULT(result)
1185 /* 3/4 pel, no need to clip */
1186 result = (result + p_ref[dy-(inpitch<<1)] + 1);
1187 result = (result >> 1);
1188 *(p_cur += outpitch) = result;
1189 /* second pixel */
1190 r0 = *(p_ref += inpitch);
1191 result = (r1 + r0);
1192 r1 = (r2 + r5);
1193 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1194 r1 = (r3 + r4);
1195 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1196 result = (result + 16) >> 5;
1197 CLIP_RESULT(result)
1198 /* 3/4 pel, no need to clip */
1199 result = (result + p_ref[dy-(inpitch<<1)] + 1);
1200 result = (result >> 1);
1201 *(p_cur += outpitch) = result;
1202 /* third pixel */
1203 r1 = *(p_ref += inpitch);
1204 result = (r2 + r1);
1205 r2 = (r3 + r0);
1206 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1207 r2 = (r4 + r5);
1208 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1209 result = (result + 16) >> 5;
1210 CLIP_RESULT(result)
1211 /* 3/4 pel, no need to clip */
1212 result = (result + p_ref[dy-(inpitch<<1)] + 1);
1213 result = (result >> 1);
1214 *(p_cur += outpitch) = result;
1215 /* fourth pixel */
1216 r2 = *(p_ref += inpitch);
1217 result = (r3 + r2);
1218 r3 = (r4 + r1);
1219 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1220 r3 = (r5 + r0);
1221 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1222 result = (result + 16) >> 5;
1223 CLIP_RESULT(result)
1224 /* 3/4 pel, no need to clip */
1225 result = (result + p_ref[dy-(inpitch<<1)] + 1);
1226 result = (result >> 1);
1227 *(p_cur += outpitch) = result;
1228 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1229 }
1230 p_cur += (curr_offset - 3);
1231 }
1232 }
1233 }
1234 }
1235 else
1236 {
1237 for (j = 0; j < blkwidth; j += 4, in += 4)
1238 {
1239 r13 = 0;
1240 p_ref = in;
1241 p_cur -= outpitch; /* compensate for the first offset */
1242 tmp = (uint32)(p_ref + ref_offset); /* limit */
1243 while ((uint32)p_ref < tmp) /* the loop un-rolled */
1244 {
1245 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1246 p_ref += inpitch;
1247 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1248 r0 &= 0xFF00FF;
1249
1250 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
1251 r7 = (r1 >> 8) & 0xFF00FF;
1252 r1 &= 0xFF00FF;
1253
1254 r0 += r1;
1255 r6 += r7;
1256
1257 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1258 r8 = (r2 >> 8) & 0xFF00FF;
1259 r2 &= 0xFF00FF;
1260
1261 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1262 r7 = (r1 >> 8) & 0xFF00FF;
1263 r1 &= 0xFF00FF;
1264 r1 += r2;
1265
1266 r7 += r8;
1267
1268 r0 += 20 * r1;
1269 r6 += 20 * r7;
1270 r0 += 0x100010;
1271 r6 += 0x100010;
1272
1273 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1274 r8 = (r2 >> 8) & 0xFF00FF;
1275 r2 &= 0xFF00FF;
1276
1277 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1278 r7 = (r1 >> 8) & 0xFF00FF;
1279 r1 &= 0xFF00FF;
1280 r1 += r2;
1281
1282 r7 += r8;
1283
1284 r0 -= 5 * r1;
1285 r6 -= 5 * r7;
1286
1287 r0 >>= 5;
1288 r6 >>= 5;
1289 /* clip */
1290 r13 |= r6;
1291 r13 |= r0;
1292 //CLIPPACK(r6,result)
1293 r0 &= 0xFF00FF;
1294 r6 &= 0xFF00FF;
1295 r0 |= (r6 << 8); /* pack it back */
1296 *((uint32*)(p_cur += outpitch)) = r0;
1297 }
1298 p_cur += curr_offset; /* offset to the next pixel */
1299 if (r13 & 0xFF000700) /* this column need clipping */
1300 {
1301 p_cur -= 4;
1302 for (i = 0; i < 4; i++)
1303 {
1304 p_ref = in + i;
1305 p_cur -= outpitch; /* compensate for the first offset */
1306 tmp = (uint32)(p_ref + ref_offset); /* limit */
1307 while ((uint32)p_ref < tmp)
1308 { /* loop un-rolled */
1309 r0 = *(p_ref - (inpitch << 1));
1310 r1 = *(p_ref - inpitch);
1311 r2 = *p_ref;
1312 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1313 r4 = *(p_ref += inpitch);
1314 /* first pixel */
1315 r5 = *(p_ref += inpitch);
1316 result = (r0 + r5);
1317 r0 = (r1 + r4);
1318 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1319 r0 = (r2 + r3);
1320 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1321 result = (result + 16) >> 5;
1322 CLIP_RESULT(result)
1323 *(p_cur += outpitch) = result;
1324 /* second pixel */
1325 r0 = *(p_ref += inpitch);
1326 result = (r1 + r0);
1327 r1 = (r2 + r5);
1328 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1329 r1 = (r3 + r4);
1330 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1331 result = (result + 16) >> 5;
1332 CLIP_RESULT(result)
1333 *(p_cur += outpitch) = result;
1334 /* third pixel */
1335 r1 = *(p_ref += inpitch);
1336 result = (r2 + r1);
1337 r2 = (r3 + r0);
1338 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1339 r2 = (r4 + r5);
1340 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1341 result = (result + 16) >> 5;
1342 CLIP_RESULT(result)
1343 *(p_cur += outpitch) = result;
1344 /* fourth pixel */
1345 r2 = *(p_ref += inpitch);
1346 result = (r3 + r2);
1347 r3 = (r4 + r1);
1348 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1349 r3 = (r5 + r0);
1350 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1351 result = (result + 16) >> 5;
1352 CLIP_RESULT(result)
1353 *(p_cur += outpitch) = result;
1354 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1355 }
1356 p_cur += (curr_offset - 3);
1357 }
1358 }
1359 }
1360 }
1361
1362 return ;
1363 }
1364
VertInterp2MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)1365 void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1366 int blkwidth, int blkheight)
1367 {
1368 int *p_cur;
1369 uint8 *p_ref;
1370 uint32 tmp;
1371 int result, curr_offset, ref_offset;
1372 int j, r0, r1, r2, r3, r4, r5;
1373
1374 p_cur = out;
1375 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1376 ref_offset = blkheight * inpitch; /* for limit */
1377
1378 for (j = 0; j < blkwidth; j++)
1379 {
1380 p_cur -= outpitch; /* compensate for the first offset */
1381 p_ref = in++;
1382
1383 tmp = (uint32)(p_ref + ref_offset); /* limit */
1384 while ((uint32)p_ref < tmp)
1385 { /* loop un-rolled */
1386 r0 = *(p_ref - (inpitch << 1));
1387 r1 = *(p_ref - inpitch);
1388 r2 = *p_ref;
1389 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1390 r4 = *(p_ref += inpitch);
1391 /* first pixel */
1392 r5 = *(p_ref += inpitch);
1393 result = (r0 + r5);
1394 r0 = (r1 + r4);
1395 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1396 r0 = (r2 + r3);
1397 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1398 *(p_cur += outpitch) = result;
1399 /* second pixel */
1400 r0 = *(p_ref += inpitch);
1401 result = (r1 + r0);
1402 r1 = (r2 + r5);
1403 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1404 r1 = (r3 + r4);
1405 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1406 *(p_cur += outpitch) = result;
1407 /* third pixel */
1408 r1 = *(p_ref += inpitch);
1409 result = (r2 + r1);
1410 r2 = (r3 + r0);
1411 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1412 r2 = (r4 + r5);
1413 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1414 *(p_cur += outpitch) = result;
1415 /* fourth pixel */
1416 r2 = *(p_ref += inpitch);
1417 result = (r3 + r2);
1418 r3 = (r4 + r1);
1419 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1420 r3 = (r5 + r0);
1421 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1422 *(p_cur += outpitch) = result;
1423 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1424 }
1425 p_cur += curr_offset;
1426 }
1427
1428 return ;
1429 }
1430
VertInterp3MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1431 void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1432 int blkwidth, int blkheight, int dy)
1433 {
1434 uint8 *p_cur;
1435 int *p_ref;
1436 uint32 tmp;
1437 int result, result2, curr_offset, ref_offset;
1438 int j, r0, r1, r2, r3, r4, r5;
1439
1440 p_cur = out;
1441 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1442 ref_offset = blkheight * inpitch; /* for limit */
1443
1444 if (dy&1)
1445 {
1446 dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1447
1448 for (j = 0; j < blkwidth; j++)
1449 {
1450 p_cur -= outpitch; /* compensate for the first offset */
1451 p_ref = in++;
1452
1453 tmp = (uint32)(p_ref + ref_offset); /* limit */
1454 while ((uint32)p_ref < tmp)
1455 { /* loop un-rolled */
1456 r0 = *(p_ref - (inpitch << 1));
1457 r1 = *(p_ref - inpitch);
1458 r2 = *p_ref;
1459 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1460 r4 = *(p_ref += inpitch);
1461 /* first pixel */
1462 r5 = *(p_ref += inpitch);
1463 result = (r0 + r5);
1464 r0 = (r1 + r4);
1465 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1466 r0 = (r2 + r3);
1467 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1468 result = (result + 512) >> 10;
1469 CLIP_RESULT(result)
1470 result2 = ((p_ref[dy] + 16) >> 5);
1471 CLIP_RESULT(result2)
1472 /* 3/4 pel, no need to clip */
1473 result = (result + result2 + 1);
1474 result = (result >> 1);
1475 *(p_cur += outpitch) = result;
1476 /* second pixel */
1477 r0 = *(p_ref += inpitch);
1478 result = (r1 + r0);
1479 r1 = (r2 + r5);
1480 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1481 r1 = (r3 + r4);
1482 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1483 result = (result + 512) >> 10;
1484 CLIP_RESULT(result)
1485 result2 = ((p_ref[dy] + 16) >> 5);
1486 CLIP_RESULT(result2)
1487 /* 3/4 pel, no need to clip */
1488 result = (result + result2 + 1);
1489 result = (result >> 1);
1490 *(p_cur += outpitch) = result;
1491 /* third pixel */
1492 r1 = *(p_ref += inpitch);
1493 result = (r2 + r1);
1494 r2 = (r3 + r0);
1495 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1496 r2 = (r4 + r5);
1497 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1498 result = (result + 512) >> 10;
1499 CLIP_RESULT(result)
1500 result2 = ((p_ref[dy] + 16) >> 5);
1501 CLIP_RESULT(result2)
1502 /* 3/4 pel, no need to clip */
1503 result = (result + result2 + 1);
1504 result = (result >> 1);
1505 *(p_cur += outpitch) = result;
1506 /* fourth pixel */
1507 r2 = *(p_ref += inpitch);
1508 result = (r3 + r2);
1509 r3 = (r4 + r1);
1510 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1511 r3 = (r5 + r0);
1512 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1513 result = (result + 512) >> 10;
1514 CLIP_RESULT(result)
1515 result2 = ((p_ref[dy] + 16) >> 5);
1516 CLIP_RESULT(result2)
1517 /* 3/4 pel, no need to clip */
1518 result = (result + result2 + 1);
1519 result = (result >> 1);
1520 *(p_cur += outpitch) = result;
1521 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1522 }
1523 p_cur += curr_offset;
1524 }
1525 }
1526 else
1527 {
1528 for (j = 0; j < blkwidth; j++)
1529 {
1530 p_cur -= outpitch; /* compensate for the first offset */
1531 p_ref = in++;
1532
1533 tmp = (uint32)(p_ref + ref_offset); /* limit */
1534 while ((uint32)p_ref < tmp)
1535 { /* loop un-rolled */
1536 r0 = *(p_ref - (inpitch << 1));
1537 r1 = *(p_ref - inpitch);
1538 r2 = *p_ref;
1539 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1540 r4 = *(p_ref += inpitch);
1541 /* first pixel */
1542 r5 = *(p_ref += inpitch);
1543 result = (r0 + r5);
1544 r0 = (r1 + r4);
1545 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1546 r0 = (r2 + r3);
1547 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1548 result = (result + 512) >> 10;
1549 CLIP_RESULT(result)
1550 *(p_cur += outpitch) = result;
1551 /* second pixel */
1552 r0 = *(p_ref += inpitch);
1553 result = (r1 + r0);
1554 r1 = (r2 + r5);
1555 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1556 r1 = (r3 + r4);
1557 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1558 result = (result + 512) >> 10;
1559 CLIP_RESULT(result)
1560 *(p_cur += outpitch) = result;
1561 /* third pixel */
1562 r1 = *(p_ref += inpitch);
1563 result = (r2 + r1);
1564 r2 = (r3 + r0);
1565 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1566 r2 = (r4 + r5);
1567 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1568 result = (result + 512) >> 10;
1569 CLIP_RESULT(result)
1570 *(p_cur += outpitch) = result;
1571 /* fourth pixel */
1572 r2 = *(p_ref += inpitch);
1573 result = (r3 + r2);
1574 r3 = (r4 + r1);
1575 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1576 r3 = (r5 + r0);
1577 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1578 result = (result + 512) >> 10;
1579 CLIP_RESULT(result)
1580 *(p_cur += outpitch) = result;
1581 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1582 }
1583 p_cur += curr_offset;
1584 }
1585 }
1586
1587 return ;
1588 }
1589
DiagonalInterpMC(uint8 * in1,uint8 * in2,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1590 void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1591 uint8 *out, int outpitch,
1592 int blkwidth, int blkheight)
1593 {
1594 int j, i;
1595 int result;
1596 uint8 *p_cur, *p_ref, *p_tmp8;
1597 int curr_offset, ref_offset;
1598 uint8 tmp_res[24][24], tmp_in[24][24];
1599 uint32 *p_tmp;
1600 uint32 tmp, pkres, tmp_result;
1601 int32 r0, r1, r2, r3, r4, r5;
1602 int32 r6, r7, r8, r9, r10, r13;
1603
1604 ref_offset = inpitch - blkwidth;
1605 p_ref = in1 - 2;
1606 /* perform horizontal interpolation */
1607 /* not word-aligned */
1608 /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1609 /* if(((uint32)p_ref)&0x3)
1610 {
1611 CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1612 p_ref = &tmp_in[0][0];
1613 ref_offset = 24-blkwidth;
1614 }*/
1615
1616 p_tmp = (uint32*) & (tmp_res[0][0]);
1617 for (j = blkheight; j > 0; j--)
1618 {
1619 r13 = 0;
1620 tmp = (uint32)(p_ref + blkwidth);
1621
1622 //r0 = *((uint32*)p_ref); /* d,c,b,a */
1623 //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */
1624 //r0 &= 0xFF00FF; /* 0,c,0,a */
1625 /* It is faster to read 1 byte at a time, */
1626 r0 = p_ref[0];
1627 r1 = p_ref[2];
1628 r0 |= (r1 << 16); /* 0,c,0,a */
1629 r1 = p_ref[1];
1630 r2 = p_ref[3];
1631 r1 |= (r2 << 16); /* 0,d,0,b */
1632
1633 while ((uint32)p_ref < tmp)
1634 {
1635 //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1636 //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */
1637 //r2 &= 0xFF00FF; /* 0,g,0,e */
1638 /* It is faster to read 1 byte at a time, */
1639 r2 = *(p_ref += 4);
1640 r3 = p_ref[2];
1641 r2 |= (r3 << 16); /* 0,g,0,e */
1642 r3 = p_ref[1];
1643 r4 = p_ref[3];
1644 r3 |= (r4 << 16); /* 0,h,0,f */
1645
1646 r4 = r0 + r3; /* c+h, a+f */
1647 r5 = r0 + r1; /* c+d, a+b */
1648 r6 = r2 + r3; /* g+h, e+f */
1649 r5 >>= 16;
1650 r5 |= (r6 << 16); /* e+f, c+d */
1651 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
1652 r4 += 0x100010; /* +16, +16 */
1653 r5 = r1 + r2; /* d+g, b+e */
1654 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1655 r4 >>= 5;
1656 r13 |= r4; /* check clipping */
1657 r4 &= 0xFF00FF; /* mask */
1658
1659 r5 = p_ref[4]; /* i */
1660 r6 = (r5 << 16);
1661 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1662 r5 += r1; /* d+i, b+g */ /* r5 not free */
1663 r1 >>= 16;
1664 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1665 r1 += r2; /* f+g, d+e */
1666 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
1667 r0 >>= 16;
1668 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1669 r0 += r3; /* e+h, c+f */
1670 r5 += 0x100010; /* 16,16 */
1671 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1672 r5 >>= 5;
1673 r13 |= r5; /* check clipping */
1674 r5 &= 0xFF00FF; /* mask */
1675
1676 r4 |= (r5 << 8); /* pack them together */
1677 *p_tmp++ = r4;
1678 r1 = r3;
1679 r0 = r2;
1680 }
1681 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1682 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
1683
1684 if (r13&0xFF000700) /* need clipping */
1685 {
1686 /* move back to the beginning of the line */
1687 p_ref -= (ref_offset + blkwidth); /* input */
1688 p_tmp -= 6; /* intermediate output */
1689 tmp = (uint32)(p_ref + blkwidth);
1690 while ((uint32)p_ref < tmp)
1691 {
1692 r0 = *p_ref++;
1693 r1 = *p_ref++;
1694 r2 = *p_ref++;
1695 r3 = *p_ref++;
1696 r4 = *p_ref++;
1697 /* first pixel */
1698 r5 = *p_ref++;
1699 result = (r0 + r5);
1700 r0 = (r1 + r4);
1701 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1702 r0 = (r2 + r3);
1703 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1704 result = (result + 16) >> 5;
1705 CLIP_RESULT(result)
1706 pkres = result;
1707 /* second pixel */
1708 r0 = *p_ref++;
1709 result = (r1 + r0);
1710 r1 = (r2 + r5);
1711 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1712 r1 = (r3 + r4);
1713 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1714 result = (result + 16) >> 5;
1715 CLIP_RESULT(result)
1716 pkres |= (result << 8);
1717 /* third pixel */
1718 r1 = *p_ref++;
1719 result = (r2 + r1);
1720 r2 = (r3 + r0);
1721 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1722 r2 = (r4 + r5);
1723 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1724 result = (result + 16) >> 5;
1725 CLIP_RESULT(result)
1726 pkres |= (result << 16);
1727 /* fourth pixel */
1728 r2 = *p_ref++;
1729 result = (r3 + r2);
1730 r3 = (r4 + r1);
1731 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1732 r3 = (r5 + r0);
1733 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1734 result = (result + 16) >> 5;
1735 CLIP_RESULT(result)
1736 pkres |= (result << 24);
1737
1738 *p_tmp++ = pkres; /* write 4 pixel */
1739 p_ref -= 5;
1740 }
1741 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1742 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
1743 }
1744 }
1745
1746 /* perform vertical interpolation */
1747 /* not word-aligned */
1748 if (((uint32)in2)&0x3)
1749 {
1750 CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1751 in2 = &tmp_in[2][0];
1752 inpitch = 24;
1753 }
1754
1755 p_cur = out;
1756 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1757 pkres = blkheight * inpitch; /* reuse it for limit */
1758
1759 curr_offset += 3;
1760
1761 for (j = 0; j < blkwidth; j += 4, in2 += 4)
1762 {
1763 r13 = 0;
1764 p_ref = in2;
1765 p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1766 p_tmp8 -= 24; /* compensate for the first offset */
1767 p_cur -= outpitch; /* compensate for the first offset */
1768 tmp = (uint32)(p_ref + pkres); /* limit */
1769 while ((uint32)p_ref < tmp) /* the loop un-rolled */
1770 {
1771 /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign, */
1772 /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2];
1773 r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3];
1774 r6 |= (r1<<16); p_ref+=inpitch; */
1775 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1776 p_ref += inpitch;
1777 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1778 r0 &= 0xFF00FF;
1779
1780 /*p_ref8 = p_ref+(inpitch<<1);
1781 r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16);
1782 r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/
1783 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
1784 r7 = (r1 >> 8) & 0xFF00FF;
1785 r1 &= 0xFF00FF;
1786
1787 r0 += r1;
1788 r6 += r7;
1789
1790 /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16);
1791 r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/
1792 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1793 r8 = (r2 >> 8) & 0xFF00FF;
1794 r2 &= 0xFF00FF;
1795
1796 /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
1797 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
1798 r2 = p_ref8[3]; r7 |= (r2<<16);*/
1799 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1800 r7 = (r1 >> 8) & 0xFF00FF;
1801 r1 &= 0xFF00FF;
1802 r1 += r2;
1803
1804 r7 += r8;
1805
1806 r0 += 20 * r1;
1807 r6 += 20 * r7;
1808 r0 += 0x100010;
1809 r6 += 0x100010;
1810
1811 /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2];
1812 r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/
1813 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1814 r8 = (r2 >> 8) & 0xFF00FF;
1815 r2 &= 0xFF00FF;
1816
1817 /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
1818 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
1819 r2 = p_ref8[3]; r7 |= (r2<<16);*/
1820 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1821 r7 = (r1 >> 8) & 0xFF00FF;
1822 r1 &= 0xFF00FF;
1823 r1 += r2;
1824
1825 r7 += r8;
1826
1827 r0 -= 5 * r1;
1828 r6 -= 5 * r7;
1829
1830 r0 >>= 5;
1831 r6 >>= 5;
1832 /* clip */
1833 r13 |= r6;
1834 r13 |= r0;
1835 //CLIPPACK(r6,result)
1836 /* add with horizontal results */
1837 r10 = *((uint32*)(p_tmp8 += 24));
1838 r9 = (r10 >> 8) & 0xFF00FF;
1839 r10 &= 0xFF00FF;
1840
1841 r0 += r10;
1842 r0 += 0x10001;
1843 r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */
1844
1845 r6 += r9;
1846 r6 += 0x10001;
1847 r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */
1848
1849 r0 |= (r6 << 8); /* pack it back */
1850 *((uint32*)(p_cur += outpitch)) = r0;
1851 }
1852 p_cur += curr_offset; /* offset to the next pixel */
1853 if (r13 & 0xFF000700) /* this column need clipping */
1854 {
1855 p_cur -= 4;
1856 for (i = 0; i < 4; i++)
1857 {
1858 p_ref = in2 + i;
1859 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1860 p_tmp8 -= 24; /* compensate for the first offset */
1861 p_cur -= outpitch; /* compensate for the first offset */
1862 tmp = (uint32)(p_ref + pkres); /* limit */
1863 while ((uint32)p_ref < tmp) /* the loop un-rolled */
1864 {
1865 r0 = *(p_ref - (inpitch << 1));
1866 r1 = *(p_ref - inpitch);
1867 r2 = *p_ref;
1868 r3 = *(p_ref += inpitch); /* modify pointer before loading */
1869 r4 = *(p_ref += inpitch);
1870 /* first pixel */
1871 r5 = *(p_ref += inpitch);
1872 result = (r0 + r5);
1873 r0 = (r1 + r4);
1874 result -= (r0 * 5);//result -= r0; result -= (r0<<2);
1875 r0 = (r2 + r3);
1876 result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
1877 result = (result + 16) >> 5;
1878 CLIP_RESULT(result)
1879 tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */
1880 result = (result + tmp_result + 1); /* no clip */
1881 result = (result >> 1);
1882 *(p_cur += outpitch) = result;
1883 /* second pixel */
1884 r0 = *(p_ref += inpitch);
1885 result = (r1 + r0);
1886 r1 = (r2 + r5);
1887 result -= (r1 * 5);//result -= r1; result -= (r1<<2);
1888 r1 = (r3 + r4);
1889 result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
1890 result = (result + 16) >> 5;
1891 CLIP_RESULT(result)
1892 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1893 result = (result + tmp_result + 1); /* no clip */
1894 result = (result >> 1);
1895 *(p_cur += outpitch) = result;
1896 /* third pixel */
1897 r1 = *(p_ref += inpitch);
1898 result = (r2 + r1);
1899 r2 = (r3 + r0);
1900 result -= (r2 * 5);//result -= r2; result -= (r2<<2);
1901 r2 = (r4 + r5);
1902 result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
1903 result = (result + 16) >> 5;
1904 CLIP_RESULT(result)
1905 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1906 result = (result + tmp_result + 1); /* no clip */
1907 result = (result >> 1);
1908 *(p_cur += outpitch) = result;
1909 /* fourth pixel */
1910 r2 = *(p_ref += inpitch);
1911 result = (r3 + r2);
1912 r3 = (r4 + r1);
1913 result -= (r3 * 5);//result -= r3; result -= (r3<<2);
1914 r3 = (r5 + r0);
1915 result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
1916 result = (result + 16) >> 5;
1917 CLIP_RESULT(result)
1918 tmp_result = *(p_tmp8 += 24); /* intermediate result */
1919 result = (result + tmp_result + 1); /* no clip */
1920 result = (result >> 1);
1921 *(p_cur += outpitch) = result;
1922 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
1923 }
1924 p_cur += (curr_offset - 3);
1925 }
1926 }
1927 }
1928
1929 return ;
1930 }
1931
1932 /* position G */
FullPelMC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1933 void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1934 int blkwidth, int blkheight)
1935 {
1936 int i, j;
1937 int offset_in = inpitch - blkwidth;
1938 int offset_out = outpitch - blkwidth;
1939 uint32 temp;
1940 uint8 byte;
1941
1942 if (((uint32)in)&3)
1943 {
1944 for (j = blkheight; j > 0; j--)
1945 {
1946 for (i = blkwidth; i > 0; i -= 4)
1947 {
1948 temp = *in++;
1949 byte = *in++;
1950 temp |= (byte << 8);
1951 byte = *in++;
1952 temp |= (byte << 16);
1953 byte = *in++;
1954 temp |= (byte << 24);
1955
1956 *((uint32*)out) = temp; /* write 4 bytes */
1957 out += 4;
1958 }
1959 out += offset_out;
1960 in += offset_in;
1961 }
1962 }
1963 else
1964 {
1965 for (j = blkheight; j > 0; j--)
1966 {
1967 for (i = blkwidth; i > 0; i -= 4)
1968 {
1969 temp = *((uint32*)in);
1970 *((uint32*)out) = temp;
1971 in += 4;
1972 out += 4;
1973 }
1974 out += offset_out;
1975 in += offset_in;
1976 }
1977 }
1978 return ;
1979 }
1980
ChromaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)1981 void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1982 int x_pos, int y_pos,
1983 uint8 *pred, int pred_pitch,
1984 int blkwidth, int blkheight)
1985 {
1986 int dx, dy;
1987 int offset_dx, offset_dy;
1988 int index;
1989 uint8 temp[24][24];
1990
1991 dx = x_pos & 7;
1992 dy = y_pos & 7;
1993 offset_dx = (dx + 7) >> 3;
1994 offset_dy = (dy + 7) >> 3;
1995 x_pos = x_pos >> 3; /* round it to full-pel resolution */
1996 y_pos = y_pos >> 3;
1997
1998 if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight))
1999 {
2000 ref += y_pos * picwidth + x_pos;
2001 }
2002 else
2003 {
2004 CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy);
2005 ref = &temp[0][0];
2006 picwidth = 24;
2007 }
2008
2009 index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
2010
2011 (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight);
2012 return ;
2013 }
2014
2015
2016 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
ChromaDiagonalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2017 void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2019 {
2020 int32 r0, r1, r2, r3, result0, result1;
2021 uint8 temp[288];
2022 uint8 *ref, *out;
2023 int i, j;
2024 int dx_8 = 8 - dx;
2025 int dy_8 = 8 - dy;
2026
2027 /* horizontal first */
2028 out = temp;
2029 for (i = 0; i < blkheight + 1; i++)
2030 {
2031 ref = pRef;
2032 r0 = ref[0];
2033 for (j = 0; j < blkwidth; j += 4)
2034 {
2035 r0 |= (ref[2] << 16);
2036 result0 = dx_8 * r0;
2037
2038 r1 = ref[1] | (ref[3] << 16);
2039 result0 += dx * r1;
2040 *(int32 *)out = result0;
2041
2042 result0 = dx_8 * r1;
2043
2044 r2 = ref[4];
2045 r0 = r0 >> 16;
2046 r1 = r0 | (r2 << 16);
2047 result0 += dx * r1;
2048 *(int32 *)(out + 16) = result0;
2049
2050 ref += 4;
2051 out += 4;
2052 r0 = r2;
2053 }
2054 pRef += srcPitch;
2055 out += (32 - blkwidth);
2056 }
2057
2058 // pRef -= srcPitch*(blkheight+1);
2059 ref = temp;
2060
2061 for (j = 0; j < blkwidth; j += 4)
2062 {
2063 r0 = *(int32 *)ref;
2064 r1 = *(int32 *)(ref + 16);
2065 ref += 32;
2066 out = pOut;
2067 for (i = 0; i < (blkheight >> 1); i++)
2068 {
2069 result0 = dy_8 * r0 + 0x00200020;
2070 r2 = *(int32 *)ref;
2071 result0 += dy * r2;
2072 result0 >>= 6;
2073 result0 &= 0x00FF00FF;
2074 r0 = r2;
2075
2076 result1 = dy_8 * r1 + 0x00200020;
2077 r3 = *(int32 *)(ref + 16);
2078 result1 += dy * r3;
2079 result1 >>= 6;
2080 result1 &= 0x00FF00FF;
2081 r1 = r3;
2082 *(int32 *)out = result0 | (result1 << 8);
2083 out += predPitch;
2084 ref += 32;
2085
2086 result0 = dy_8 * r0 + 0x00200020;
2087 r2 = *(int32 *)ref;
2088 result0 += dy * r2;
2089 result0 >>= 6;
2090 result0 &= 0x00FF00FF;
2091 r0 = r2;
2092
2093 result1 = dy_8 * r1 + 0x00200020;
2094 r3 = *(int32 *)(ref + 16);
2095 result1 += dy * r3;
2096 result1 >>= 6;
2097 result1 &= 0x00FF00FF;
2098 r1 = r3;
2099 *(int32 *)out = result0 | (result1 << 8);
2100 out += predPitch;
2101 ref += 32;
2102 }
2103 pOut += 4;
2104 ref = temp + 4; /* since it can only iterate twice max */
2105 }
2106 return;
2107 }
2108
ChromaHorizontalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2109 void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2110 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2111 {
2112 OSCL_UNUSED_ARG(dy);
2113 int32 r0, r1, r2, result0, result1;
2114 uint8 *ref, *out;
2115 int i, j;
2116 int dx_8 = 8 - dx;
2117
2118 /* horizontal first */
2119 for (i = 0; i < blkheight; i++)
2120 {
2121 ref = pRef;
2122 out = pOut;
2123
2124 r0 = ref[0];
2125 for (j = 0; j < blkwidth; j += 4)
2126 {
2127 r0 |= (ref[2] << 16);
2128 result0 = dx_8 * r0 + 0x00040004;
2129
2130 r1 = ref[1] | (ref[3] << 16);
2131 result0 += dx * r1;
2132 result0 >>= 3;
2133 result0 &= 0x00FF00FF;
2134
2135 result1 = dx_8 * r1 + 0x00040004;
2136
2137 r2 = ref[4];
2138 r0 = r0 >> 16;
2139 r1 = r0 | (r2 << 16);
2140 result1 += dx * r1;
2141 result1 >>= 3;
2142 result1 &= 0x00FF00FF;
2143
2144 *(int32 *)out = result0 | (result1 << 8);
2145
2146 ref += 4;
2147 out += 4;
2148 r0 = r2;
2149 }
2150
2151 pRef += srcPitch;
2152 pOut += predPitch;
2153 }
2154 return;
2155 }
2156
ChromaVerticalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2157 void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2158 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2159 {
2160 OSCL_UNUSED_ARG(dx);
2161 int32 r0, r1, r2, r3, result0, result1;
2162 int i, j;
2163 uint8 *ref, *out;
2164 int dy_8 = 8 - dy;
2165 /* vertical first */
2166 for (i = 0; i < blkwidth; i += 4)
2167 {
2168 ref = pRef;
2169 out = pOut;
2170
2171 r0 = ref[0] | (ref[2] << 16);
2172 r1 = ref[1] | (ref[3] << 16);
2173 ref += srcPitch;
2174 for (j = 0; j < blkheight; j++)
2175 {
2176 result0 = dy_8 * r0 + 0x00040004;
2177 r2 = ref[0] | (ref[2] << 16);
2178 result0 += dy * r2;
2179 result0 >>= 3;
2180 result0 &= 0x00FF00FF;
2181 r0 = r2;
2182
2183 result1 = dy_8 * r1 + 0x00040004;
2184 r3 = ref[1] | (ref[3] << 16);
2185 result1 += dy * r3;
2186 result1 >>= 3;
2187 result1 &= 0x00FF00FF;
2188 r1 = r3;
2189 *(int32 *)out = result0 | (result1 << 8);
2190 ref += srcPitch;
2191 out += predPitch;
2192 }
2193 pOut += 4;
2194 pRef += 4;
2195 }
2196 return;
2197 }
2198
ChromaDiagonalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2199 void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2200 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2201 {
2202 OSCL_UNUSED_ARG(blkwidth);
2203 int32 r0, r1, temp0, temp1, result;
2204 int32 temp[9];
2205 int32 *out;
2206 int i, r_temp;
2207 int dy_8 = 8 - dy;
2208
2209 /* horizontal first */
2210 out = temp;
2211 for (i = 0; i < blkheight + 1; i++)
2212 {
2213 r_temp = pRef[1];
2214 temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2215 temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2216 r0 = temp0 | (temp1 << 16);
2217 *out++ = r0;
2218 pRef += srcPitch;
2219 }
2220
2221 pRef -= srcPitch * (blkheight + 1);
2222
2223 out = temp;
2224
2225 r0 = *out++;
2226
2227 for (i = 0; i < blkheight; i++)
2228 {
2229 result = dy_8 * r0 + 0x00200020;
2230 r1 = *out++;
2231 result += dy * r1;
2232 result >>= 6;
2233 result &= 0x00FF00FF;
2234 *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2235 r0 = r1;
2236 pOut += predPitch;
2237 }
2238 return;
2239 }
2240
ChromaHorizontalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2241 void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2242 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2243 {
2244 OSCL_UNUSED_ARG(dy);
2245 OSCL_UNUSED_ARG(blkwidth);
2246 int i, temp, temp0, temp1;
2247
2248 /* horizontal first */
2249 for (i = 0; i < blkheight; i++)
2250 {
2251 temp = pRef[1];
2252 temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2253 temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2254
2255 *(int16 *)pOut = temp0 | (temp1 << 8);
2256 pRef += srcPitch;
2257 pOut += predPitch;
2258
2259 }
2260 return;
2261 }
ChromaVerticalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2262 void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2263 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2264 {
2265 OSCL_UNUSED_ARG(dx);
2266 OSCL_UNUSED_ARG(blkwidth);
2267 int32 r0, r1, result;
2268 int i;
2269 int dy_8 = 8 - dy;
2270 r0 = pRef[0] | (pRef[1] << 16);
2271 pRef += srcPitch;
2272 for (i = 0; i < blkheight; i++)
2273 {
2274 result = dy_8 * r0 + 0x00040004;
2275 r1 = pRef[0] | (pRef[1] << 16);
2276 result += dy * r1;
2277 result >>= 3;
2278 result &= 0x00FF00FF;
2279 *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2280 r0 = r1;
2281 pRef += srcPitch;
2282 pOut += predPitch;
2283 }
2284 return;
2285 }
2286
ChromaFullMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2287 void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2288 uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2289 {
2290 OSCL_UNUSED_ARG(dx);
2291 OSCL_UNUSED_ARG(dy);
2292 int i, j;
2293 int offset_in = srcPitch - blkwidth;
2294 int offset_out = predPitch - blkwidth;
2295 uint16 temp;
2296 uint8 byte;
2297
2298 if (((uint32)pRef)&1)
2299 {
2300 for (j = blkheight; j > 0; j--)
2301 {
2302 for (i = blkwidth; i > 0; i -= 2)
2303 {
2304 temp = *pRef++;
2305 byte = *pRef++;
2306 temp |= (byte << 8);
2307 *((uint16*)pOut) = temp; /* write 2 bytes */
2308 pOut += 2;
2309 }
2310 pOut += offset_out;
2311 pRef += offset_in;
2312 }
2313 }
2314 else
2315 {
2316 for (j = blkheight; j > 0; j--)
2317 {
2318 for (i = blkwidth; i > 0; i -= 2)
2319 {
2320 temp = *((uint16*)pRef);
2321 *((uint16*)pOut) = temp;
2322 pRef += 2;
2323 pOut += 2;
2324 }
2325 pOut += offset_out;
2326 pRef += offset_in;
2327 }
2328 }
2329 return ;
2330 }
2331