• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 #include "avcdec_lib.h"
19 #include "oscl_mem.h"
20 
21 
22 #define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
23                  x = 0xFF & (~(x>>31));}
24 
25 /* (blkwidth << 2) + (dy << 1) + dx */
26 static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
27 {
28     &ChromaFullMC_SIMD,
29     &ChromaHorizontalMC_SIMD,
30     &ChromaVerticalMC_SIMD,
31     &ChromaDiagonalMC_SIMD,
32     &ChromaFullMC_SIMD,
33     &ChromaHorizontalMC2_SIMD,
34     &ChromaVerticalMC2_SIMD,
35     &ChromaDiagonalMC2_SIMD
36 };
37 /* Perform motion prediction and compensation with residue if exist. */
InterMBPrediction(AVCCommonObj * video)38 void InterMBPrediction(AVCCommonObj *video)
39 {
40     AVCMacroblock *currMB = video->currMB;
41     AVCPictureData *currPic = video->currPic;
42     int mbPartIdx, subMbPartIdx;
43     int ref_idx;
44     int offset_MbPart_indx = 0;
45     int16 *mv;
46     uint32 x_pos, y_pos;
47     uint8 *curL, *curCb, *curCr;
48     uint8 *ref_l, *ref_Cb, *ref_Cr;
49     uint8 *predBlock, *predCb, *predCr;
50     int block_x, block_y, offset_x, offset_y, offsetP, offset;
51     int x_position = (video->mb_x << 4);
52     int y_position = (video->mb_y << 4);
53     int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
54     int picWidth = currPic->pitch;
55     int picHeight = currPic->height;
56     int16 *dataBlock;
57     uint32 cbp4x4;
58     uint32 tmp_word;
59 
60     tmp_word = y_position * picWidth;
61     curL = currPic->Sl + tmp_word + x_position;
62     offset = (tmp_word >> 2) + (x_position >> 1);
63     curCb = currPic->Scb + offset;
64     curCr = currPic->Scr + offset;
65 
66 #ifdef USE_PRED_BLOCK
67     predBlock = video->pred + 84;
68     predCb = video->pred + 452;
69     predCr = video->pred + 596;
70 #else
71     predBlock = curL;
72     predCb = curCb;
73     predCr = curCr;
74 #endif
75 
76     GetMotionVectorPredictor(video, false);
77 
78     for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
79     {
80         MbHeight = currMB->SubMbPartHeight[mbPartIdx];
81         MbWidth = currMB->SubMbPartWidth[mbPartIdx];
82         mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
83         mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
84         ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
85         offset_indx = 0;
86 
87         ref_l = video->RefPicList0[ref_idx]->Sl;
88         ref_Cb = video->RefPicList0[ref_idx]->Scb;
89         ref_Cr = video->RefPicList0[ref_idx]->Scr;
90 
91         for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
92         {
93             block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);  // check this
94             block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
95             mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
96             offset_x = x_position + (block_x << 2);
97             offset_y = y_position + (block_y << 2);
98             x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
99             y_pos = (offset_y << 2) + *mv;   /*quarter pel */
100 
101             //offset = offset_y * currPic->width;
102             //offsetC = (offset >> 2) + (offset_x >> 1);
103 #ifdef USE_PRED_BLOCK
104             offsetP = (block_y * 80) + (block_x << 2);
105             LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
106                            /*comp_Sl + offset + offset_x,*/
107                            predBlock + offsetP, 20, MbWidth, MbHeight);
108 #else
109             offsetP = (block_y << 2) * picWidth + (block_x << 2);
110             LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
111                            /*comp_Sl + offset + offset_x,*/
112                            predBlock + offsetP, picWidth, MbWidth, MbHeight);
113 #endif
114 
115 #ifdef USE_PRED_BLOCK
116             offsetP = (block_y * 24) + (block_x << 1);
117             ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
118                              /*comp_Scb +  offsetC,*/
119                              predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
120             ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
121                              /*comp_Scr +  offsetC,*/
122                              predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
123 #else
124             offsetP = (block_y * picWidth) + (block_x << 1);
125             ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
126                              /*comp_Scb +  offsetC,*/
127                              predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
128             ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
129                              /*comp_Scr +  offsetC,*/
130                              predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
131 #endif
132 
133             offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
134         }
135         offset_MbPart_indx = currMB->MbPartWidth >> 4;
136     }
137 
138     /* used in decoder, used to be if(!encFlag)  */
139 
140     /* transform in raster scan order */
141     dataBlock = video->block;
142     cbp4x4 = video->cbp4x4;
143     /* luma */
144     for (block_y = 4; block_y > 0; block_y--)
145     {
146         for (block_x = 4; block_x > 0; block_x--)
147         {
148 #ifdef USE_PRED_BLOCK
149             if (cbp4x4&1)
150             {
151                 itrans(dataBlock, predBlock, predBlock, 20);
152             }
153 #else
154             if (cbp4x4&1)
155             {
156                 itrans(dataBlock, curL, curL, picWidth);
157             }
158 #endif
159             cbp4x4 >>= 1;
160             dataBlock += 4;
161 #ifdef USE_PRED_BLOCK
162             predBlock += 4;
163 #else
164             curL += 4;
165 #endif
166         }
167         dataBlock += 48;
168 #ifdef USE_PRED_BLOCK
169         predBlock += 64;
170 #else
171         curL += ((picWidth << 2) - 16);
172 #endif
173     }
174 
175     /* chroma */
176     picWidth = (picWidth >> 1);
177     for (block_y = 2; block_y > 0; block_y--)
178     {
179         for (block_x = 2; block_x > 0; block_x--)
180         {
181 #ifdef USE_PRED_BLOCK
182             if (cbp4x4&1)
183             {
184                 ictrans(dataBlock, predCb, predCb, 12);
185             }
186 #else
187             if (cbp4x4&1)
188             {
189                 ictrans(dataBlock, curCb, curCb, picWidth);
190             }
191 #endif
192             cbp4x4 >>= 1;
193             dataBlock += 4;
194 #ifdef USE_PRED_BLOCK
195             predCb += 4;
196 #else
197             curCb += 4;
198 #endif
199         }
200         for (block_x = 2; block_x > 0; block_x--)
201         {
202 #ifdef USE_PRED_BLOCK
203             if (cbp4x4&1)
204             {
205                 ictrans(dataBlock, predCr, predCr, 12);
206             }
207 #else
208             if (cbp4x4&1)
209             {
210                 ictrans(dataBlock, curCr, curCr, picWidth);
211             }
212 #endif
213             cbp4x4 >>= 1;
214             dataBlock += 4;
215 #ifdef USE_PRED_BLOCK
216             predCr += 4;
217 #else
218             curCr += 4;
219 #endif
220         }
221         dataBlock += 48;
222 #ifdef USE_PRED_BLOCK
223         predCb += 40;
224         predCr += 40;
225 #else
226         curCb += ((picWidth << 2) - 8);
227         curCr += ((picWidth << 2) - 8);
228 #endif
229     }
230 
231 #ifdef MB_BASED_DEBLOCK
232     SaveNeighborForIntraPred(video, offset);
233 #endif
234 
235     return ;
236 }
237 
238 
239 /* preform the actual  motion comp here */
LumaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)240 void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
241                     int x_pos, int y_pos,
242                     uint8 *pred, int pred_pitch,
243                     int blkwidth, int blkheight)
244 {
245     int dx, dy;
246     uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
247     int temp2[21][21]; /* for intermediate results */
248     uint8 *ref2;
249 
250     dx = x_pos & 3;
251     dy = y_pos & 3;
252     x_pos = x_pos >> 2;  /* round it to full-pel resolution */
253     y_pos = y_pos >> 2;
254 
255     /* perform actual motion compensation */
256     if (dx == 0 && dy == 0)
257     {  /* fullpel position *//* G */
258         if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
259         {
260             ref += y_pos * picwidth + x_pos;
261             FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
262         }
263         else
264         {
265             CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
266             FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
267         }
268 
269     }   /* other positions */
270     else  if (dy == 0)
271     { /* no vertical interpolation *//* a,b,c*/
272 
273         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
274         {
275             ref += y_pos * picwidth + x_pos;
276 
277             HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
278         }
279         else  /* need padding */
280         {
281             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
282 
283             HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
284         }
285     }
286     else if (dx == 0)
287     { /*no horizontal interpolation *//* d,h,n */
288 
289         if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
290         {
291             ref += y_pos * picwidth + x_pos;
292 
293             VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
294         }
295         else  /* need padding */
296         {
297             CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
298 
299             VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
300         }
301     }
302     else if (dy == 2)
303     {  /* horizontal cross *//* i, j, k */
304 
305         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
306         {
307             ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
308 
309             VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
310 
311             HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
312         }
313         else /* need padding */
314         {
315             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
316 
317             VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
318 
319             HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
320         }
321     }
322     else if (dx == 2)
323     { /* vertical cross */ /* f,q */
324 
325         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
326         {
327             ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
328 
329             HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
330             VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
331         }
332         else  /* need padding */
333         {
334             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
335             HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
336             VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
337         }
338     }
339     else
340     { /* diagonal *//* e,g,p,r */
341 
342         if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
343                 y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
344         {
345             ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
346 
347             ref += (y_pos * picwidth) + x_pos + (dx / 2);
348 
349             DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
350         }
351         else  /* need padding */
352         {
353             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
354 
355             ref2 = &temp[2 + (dy/2)][2];
356 
357             ref = &temp[2][2 + (dx/2)];
358 
359             DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
360         }
361     }
362 
363     return ;
364 }
365 
CreateAlign(uint8 * ref,int picwidth,int y_pos,uint8 * out,int blkwidth,int blkheight)366 void CreateAlign(uint8 *ref, int picwidth, int y_pos,
367                  uint8 *out, int blkwidth, int blkheight)
368 {
369     int i, j;
370     int offset, out_offset;
371     uint32 prev_pix, result, pix1, pix2, pix4;
372 
373     out_offset = 24 - blkwidth;
374 
375     //switch(x_pos&0x3){
376     switch (((uint32)ref)&0x3)
377     {
378         case 1:
379             ref += y_pos * picwidth;
380             offset =  picwidth - blkwidth - 3;
381             for (j = 0; j < blkheight; j++)
382             {
383                 pix1 = *ref++;
384                 pix2 = *((uint16*)ref);
385                 ref += 2;
386                 result = (pix2 << 8) | pix1;
387 
388                 for (i = 3; i < blkwidth; i += 4)
389                 {
390                     pix4 = *((uint32*)ref);
391                     ref += 4;
392                     prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
393                     result |= prev_pix;
394                     *((uint32*)out) = result;  /* write 4 bytes */
395                     out += 4;
396                     result = pix4 >> 8; /* for the next loop */
397                 }
398                 ref += offset;
399                 out += out_offset;
400             }
401             break;
402         case 2:
403             ref += y_pos * picwidth;
404             offset =  picwidth - blkwidth - 2;
405             for (j = 0; j < blkheight; j++)
406             {
407                 result = *((uint16*)ref);
408                 ref += 2;
409                 for (i = 2; i < blkwidth; i += 4)
410                 {
411                     pix4 = *((uint32*)ref);
412                     ref += 4;
413                     prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
414                     result |= prev_pix;
415                     *((uint32*)out) = result;  /* write 4 bytes */
416                     out += 4;
417                     result = pix4 >> 16; /* for the next loop */
418                 }
419                 ref += offset;
420                 out += out_offset;
421             }
422             break;
423         case 3:
424             ref += y_pos * picwidth;
425             offset =  picwidth - blkwidth - 1;
426             for (j = 0; j < blkheight; j++)
427             {
428                 result = *ref++;
429                 for (i = 1; i < blkwidth; i += 4)
430                 {
431                     pix4 = *((uint32*)ref);
432                     ref += 4;
433                     prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
434                     result |= prev_pix;
435                     *((uint32*)out) = result;  /* write 4 bytes */
436                     out += 4;
437                     result = pix4 >> 24; /* for the next loop */
438                 }
439                 ref += offset;
440                 out += out_offset;
441             }
442             break;
443     }
444 }
445 
CreatePad(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * out,int blkwidth,int blkheight)446 void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
447                uint8 *out, int blkwidth, int blkheight)
448 {
449     int x_inc0, x_mid;
450     int y_inc, y_inc0, y_inc1, y_mid;
451     int i, j;
452     int offset;
453 
454     if (x_pos < 0)
455     {
456         x_inc0 = 0;  /* increment for the first part */
457         x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth);  /* stopping point */
458         x_pos = 0;
459     }
460     else if (x_pos + blkwidth > picwidth)
461     {
462         x_inc0 = 1;  /* increasing */
463         x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0);  /* clip negative to zero, encode fool proof! */
464     }
465     else    /* normal case */
466     {
467         x_inc0 = 1;
468         x_mid = blkwidth; /* just one run */
469     }
470 
471 
472     /* boundary for y_pos, taking the result from x_pos into account */
473     if (y_pos < 0)
474     {
475         y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
476         y_inc1 = picwidth + y_inc0;
477         y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
478         y_pos = 0;
479     }
480     else  if (y_pos + blkheight > picheight)
481     {
482         y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
483         y_inc0 = picwidth + y_inc1;                 /* increasing */
484         y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
485     }
486     else  /* normal case */
487     {
488         y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
489         y_inc0 = picwidth + y_inc1;
490         y_mid = blkheight;
491     }
492 
493     /* clip y_pos and x_pos */
494     if (y_pos > picheight - 1) y_pos = picheight - 1;
495     if (x_pos > picwidth - 1) x_pos = picwidth - 1;
496 
497     ref += y_pos * picwidth + x_pos;
498 
499     y_inc = y_inc0;  /* start with top half */
500 
501     offset = 24 - blkwidth; /* to use in offset out */
502     blkwidth -= x_mid; /* to use in the loop limit */
503 
504     if (x_inc0 == 0)
505     {
506         for (j = 0; j < blkheight; j++)
507         {
508             if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
509             {
510                 y_inc = y_inc1;  /* switch to lower half */
511             }
512             for (i = x_mid; i > 0; i--)   /* first or third quarter */
513             {
514                 *out++ = *ref;
515             }
516             for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
517             {
518                 *out++ = *ref++;
519             }
520             out += offset;
521             ref += y_inc;
522         }
523     }
524     else
525     {
526         for (j = 0; j < blkheight; j++)
527         {
528             if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
529             {
530                 y_inc = y_inc1;  /* switch to lower half */
531             }
532             for (i = x_mid; i > 0; i--)   /* first or third quarter */
533             {
534                 *out++ = *ref++;
535             }
536             for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
537             {
538                 *out++ = *ref;
539             }
540             out += offset;
541             ref += y_inc;
542         }
543     }
544 
545     return ;
546 }
547 
HorzInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)548 void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
549                    int blkwidth, int blkheight, int dx)
550 {
551     uint8 *p_ref;
552     uint32 *p_cur;
553     uint32 tmp, pkres;
554     int result, curr_offset, ref_offset;
555     int j;
556     int32 r0, r1, r2, r3, r4, r5;
557     int32 r13, r6;
558 
559     p_cur = (uint32*)out; /* assume it's word aligned */
560     curr_offset = (outpitch - blkwidth) >> 2;
561     p_ref = in;
562     ref_offset = inpitch - blkwidth;
563 
564     if (dx&1)
565     {
566         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
567         p_ref -= 2;
568         r13 = 0;
569         for (j = blkheight; j > 0; j--)
570         {
571             tmp = (uint32)(p_ref + blkwidth);
572             r0 = p_ref[0];
573             r1 = p_ref[2];
574             r0 |= (r1 << 16);           /* 0,c,0,a */
575             r1 = p_ref[1];
576             r2 = p_ref[3];
577             r1 |= (r2 << 16);           /* 0,d,0,b */
578             while ((uint32)p_ref < tmp)
579             {
580                 r2 = *(p_ref += 4); /* move pointer to e */
581                 r3 = p_ref[2];
582                 r2 |= (r3 << 16);           /* 0,g,0,e */
583                 r3 = p_ref[1];
584                 r4 = p_ref[3];
585                 r3 |= (r4 << 16);           /* 0,h,0,f */
586 
587                 r4 = r0 + r3;       /* c+h, a+f */
588                 r5 = r0 + r1;   /* c+d, a+b */
589                 r6 = r2 + r3;   /* g+h, e+f */
590                 r5 >>= 16;
591                 r5 |= (r6 << 16);   /* e+f, c+d */
592                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
593                 r4 += 0x100010; /* +16, +16 */
594                 r5 = r1 + r2;       /* d+g, b+e */
595                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
596                 r4 >>= 5;
597                 r13 |= r4;      /* check clipping */
598 
599                 r5 = p_ref[dx+2];
600                 r6 = p_ref[dx+4];
601                 r5 |= (r6 << 16);
602                 r4 += r5;
603                 r4 += 0x10001;
604                 r4 = (r4 >> 1) & 0xFF00FF;
605 
606                 r5 = p_ref[4];  /* i */
607                 r6 = (r5 << 16);
608                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
609                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
610                 r1 >>= 16;
611                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
612                 r1 += r2;       /* f+g, d+e */
613                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
614                 r0 >>= 16;
615                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
616                 r0 += r3;       /* e+h, c+f */
617                 r5 += 0x100010; /* 16,16 */
618                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
619                 r5 >>= 5;
620                 r13 |= r5;      /* check clipping */
621 
622                 r0 = p_ref[dx+3];
623                 r1 = p_ref[dx+5];
624                 r0 |= (r1 << 16);
625                 r5 += r0;
626                 r5 += 0x10001;
627                 r5 = (r5 >> 1) & 0xFF00FF;
628 
629                 r4 |= (r5 << 8);    /* pack them together */
630                 *p_cur++ = r4;
631                 r1 = r3;
632                 r0 = r2;
633             }
634             p_cur += curr_offset; /* move to the next line */
635             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
636 
637             if (r13&0xFF000700) /* need clipping */
638             {
639                 /* move back to the beginning of the line */
640                 p_ref -= (ref_offset + blkwidth);   /* input */
641                 p_cur -= (outpitch >> 2);
642 
643                 tmp = (uint32)(p_ref + blkwidth);
644                 for (; (uint32)p_ref < tmp;)
645                 {
646 
647                     r0 = *p_ref++;
648                     r1 = *p_ref++;
649                     r2 = *p_ref++;
650                     r3 = *p_ref++;
651                     r4 = *p_ref++;
652                     /* first pixel */
653                     r5 = *p_ref++;
654                     result = (r0 + r5);
655                     r0 = (r1 + r4);
656                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
657                     r0 = (r2 + r3);
658                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
659                     result = (result + 16) >> 5;
660                     CLIP_RESULT(result)
661                     /* 3/4 pel,  no need to clip */
662                     result = (result + p_ref[dx] + 1);
663                     pkres = (result >> 1) ;
664                     /* second pixel */
665                     r0 = *p_ref++;
666                     result = (r1 + r0);
667                     r1 = (r2 + r5);
668                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
669                     r1 = (r3 + r4);
670                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
671                     result = (result + 16) >> 5;
672                     CLIP_RESULT(result)
673                     /* 3/4 pel,  no need to clip */
674                     result = (result + p_ref[dx] + 1);
675                     result = (result >> 1);
676                     pkres  |= (result << 8);
677                     /* third pixel */
678                     r1 = *p_ref++;
679                     result = (r2 + r1);
680                     r2 = (r3 + r0);
681                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
682                     r2 = (r4 + r5);
683                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
684                     result = (result + 16) >> 5;
685                     CLIP_RESULT(result)
686                     /* 3/4 pel,  no need to clip */
687                     result = (result + p_ref[dx] + 1);
688                     result = (result >> 1);
689                     pkres  |= (result << 16);
690                     /* fourth pixel */
691                     r2 = *p_ref++;
692                     result = (r3 + r2);
693                     r3 = (r4 + r1);
694                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
695                     r3 = (r5 + r0);
696                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
697                     result = (result + 16) >> 5;
698                     CLIP_RESULT(result)
699                     /* 3/4 pel,  no need to clip */
700                     result = (result + p_ref[dx] + 1);
701                     result = (result >> 1);
702                     pkres  |= (result << 24);
703                     *p_cur++ = pkres; /* write 4 pixels */
704                     p_ref -= 5;  /* offset back to the middle of filter */
705                 }
706                 p_cur += curr_offset;  /* move to the next line */
707                 p_ref += ref_offset;    /* move to the next line */
708             }
709         }
710     }
711     else
712     {
713         p_ref -= 2;
714         r13 = 0;
715         for (j = blkheight; j > 0; j--)
716         {
717             tmp = (uint32)(p_ref + blkwidth);
718             r0 = p_ref[0];
719             r1 = p_ref[2];
720             r0 |= (r1 << 16);           /* 0,c,0,a */
721             r1 = p_ref[1];
722             r2 = p_ref[3];
723             r1 |= (r2 << 16);           /* 0,d,0,b */
724             while ((uint32)p_ref < tmp)
725             {
726                 r2 = *(p_ref += 4); /* move pointer to e */
727                 r3 = p_ref[2];
728                 r2 |= (r3 << 16);           /* 0,g,0,e */
729                 r3 = p_ref[1];
730                 r4 = p_ref[3];
731                 r3 |= (r4 << 16);           /* 0,h,0,f */
732 
733                 r4 = r0 + r3;       /* c+h, a+f */
734                 r5 = r0 + r1;   /* c+d, a+b */
735                 r6 = r2 + r3;   /* g+h, e+f */
736                 r5 >>= 16;
737                 r5 |= (r6 << 16);   /* e+f, c+d */
738                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
739                 r4 += 0x100010; /* +16, +16 */
740                 r5 = r1 + r2;       /* d+g, b+e */
741                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
742                 r4 >>= 5;
743                 r13 |= r4;      /* check clipping */
744                 r4 &= 0xFF00FF; /* mask */
745 
746                 r5 = p_ref[4];  /* i */
747                 r6 = (r5 << 16);
748                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
749                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
750                 r1 >>= 16;
751                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
752                 r1 += r2;       /* f+g, d+e */
753                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
754                 r0 >>= 16;
755                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
756                 r0 += r3;       /* e+h, c+f */
757                 r5 += 0x100010; /* 16,16 */
758                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
759                 r5 >>= 5;
760                 r13 |= r5;      /* check clipping */
761                 r5 &= 0xFF00FF; /* mask */
762 
763                 r4 |= (r5 << 8);    /* pack them together */
764                 *p_cur++ = r4;
765                 r1 = r3;
766                 r0 = r2;
767             }
768             p_cur += curr_offset; /* move to the next line */
769             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
770 
771             if (r13&0xFF000700) /* need clipping */
772             {
773                 /* move back to the beginning of the line */
774                 p_ref -= (ref_offset + blkwidth);   /* input */
775                 p_cur -= (outpitch >> 2);
776 
777                 tmp = (uint32)(p_ref + blkwidth);
778                 for (; (uint32)p_ref < tmp;)
779                 {
780 
781                     r0 = *p_ref++;
782                     r1 = *p_ref++;
783                     r2 = *p_ref++;
784                     r3 = *p_ref++;
785                     r4 = *p_ref++;
786                     /* first pixel */
787                     r5 = *p_ref++;
788                     result = (r0 + r5);
789                     r0 = (r1 + r4);
790                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
791                     r0 = (r2 + r3);
792                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
793                     result = (result + 16) >> 5;
794                     CLIP_RESULT(result)
795                     pkres  = result;
796                     /* second pixel */
797                     r0 = *p_ref++;
798                     result = (r1 + r0);
799                     r1 = (r2 + r5);
800                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
801                     r1 = (r3 + r4);
802                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
803                     result = (result + 16) >> 5;
804                     CLIP_RESULT(result)
805                     pkres  |= (result << 8);
806                     /* third pixel */
807                     r1 = *p_ref++;
808                     result = (r2 + r1);
809                     r2 = (r3 + r0);
810                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
811                     r2 = (r4 + r5);
812                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
813                     result = (result + 16) >> 5;
814                     CLIP_RESULT(result)
815                     pkres  |= (result << 16);
816                     /* fourth pixel */
817                     r2 = *p_ref++;
818                     result = (r3 + r2);
819                     r3 = (r4 + r1);
820                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
821                     r3 = (r5 + r0);
822                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
823                     result = (result + 16) >> 5;
824                     CLIP_RESULT(result)
825                     pkres  |= (result << 24);
826                     *p_cur++ = pkres;   /* write 4 pixels */
827                     p_ref -= 5;
828                 }
829                 p_cur += curr_offset; /* move to the next line */
830                 p_ref += ref_offset;
831             }
832         }
833     }
834 
835     return ;
836 }
837 
HorzInterp2MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dx)838 void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
839                    int blkwidth, int blkheight, int dx)
840 {
841     int *p_ref;
842     uint32 *p_cur;
843     uint32 tmp, pkres;
844     int result, result2, curr_offset, ref_offset;
845     int j, r0, r1, r2, r3, r4, r5;
846 
847     p_cur = (uint32*)out; /* assume it's word aligned */
848     curr_offset = (outpitch - blkwidth) >> 2;
849     p_ref = in;
850     ref_offset = inpitch - blkwidth;
851 
852     if (dx&1)
853     {
854         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
855 
856         for (j = blkheight; j > 0 ; j--)
857         {
858             tmp = (uint32)(p_ref + blkwidth);
859             for (; (uint32)p_ref < tmp;)
860             {
861 
862                 r0 = p_ref[-2];
863                 r1 = p_ref[-1];
864                 r2 = *p_ref++;
865                 r3 = *p_ref++;
866                 r4 = *p_ref++;
867                 /* first pixel */
868                 r5 = *p_ref++;
869                 result = (r0 + r5);
870                 r0 = (r1 + r4);
871                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
872                 r0 = (r2 + r3);
873                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
874                 result = (result + 512) >> 10;
875                 CLIP_RESULT(result)
876                 result2 = ((p_ref[dx] + 16) >> 5);
877                 CLIP_RESULT(result2)
878                 /* 3/4 pel,  no need to clip */
879                 result = (result + result2 + 1);
880                 pkres = (result >> 1);
881                 /* second pixel */
882                 r0 = *p_ref++;
883                 result = (r1 + r0);
884                 r1 = (r2 + r5);
885                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
886                 r1 = (r3 + r4);
887                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
888                 result = (result + 512) >> 10;
889                 CLIP_RESULT(result)
890                 result2 = ((p_ref[dx] + 16) >> 5);
891                 CLIP_RESULT(result2)
892                 /* 3/4 pel,  no need to clip */
893                 result = (result + result2 + 1);
894                 result = (result >> 1);
895                 pkres  |= (result << 8);
896                 /* third pixel */
897                 r1 = *p_ref++;
898                 result = (r2 + r1);
899                 r2 = (r3 + r0);
900                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
901                 r2 = (r4 + r5);
902                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
903                 result = (result + 512) >> 10;
904                 CLIP_RESULT(result)
905                 result2 = ((p_ref[dx] + 16) >> 5);
906                 CLIP_RESULT(result2)
907                 /* 3/4 pel,  no need to clip */
908                 result = (result + result2 + 1);
909                 result = (result >> 1);
910                 pkres  |= (result << 16);
911                 /* fourth pixel */
912                 r2 = *p_ref++;
913                 result = (r3 + r2);
914                 r3 = (r4 + r1);
915                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
916                 r3 = (r5 + r0);
917                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
918                 result = (result + 512) >> 10;
919                 CLIP_RESULT(result)
920                 result2 = ((p_ref[dx] + 16) >> 5);
921                 CLIP_RESULT(result2)
922                 /* 3/4 pel,  no need to clip */
923                 result = (result + result2 + 1);
924                 result = (result >> 1);
925                 pkres  |= (result << 24);
926                 *p_cur++ = pkres; /* write 4 pixels */
927                 p_ref -= 3;  /* offset back to the middle of filter */
928             }
929             p_cur += curr_offset;  /* move to the next line */
930             p_ref += ref_offset;    /* move to the next line */
931         }
932     }
933     else
934     {
935         for (j = blkheight; j > 0 ; j--)
936         {
937             tmp = (uint32)(p_ref + blkwidth);
938             for (; (uint32)p_ref < tmp;)
939             {
940 
941                 r0 = p_ref[-2];
942                 r1 = p_ref[-1];
943                 r2 = *p_ref++;
944                 r3 = *p_ref++;
945                 r4 = *p_ref++;
946                 /* first pixel */
947                 r5 = *p_ref++;
948                 result = (r0 + r5);
949                 r0 = (r1 + r4);
950                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
951                 r0 = (r2 + r3);
952                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
953                 result = (result + 512) >> 10;
954                 CLIP_RESULT(result)
955                 pkres  = result;
956                 /* second pixel */
957                 r0 = *p_ref++;
958                 result = (r1 + r0);
959                 r1 = (r2 + r5);
960                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
961                 r1 = (r3 + r4);
962                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
963                 result = (result + 512) >> 10;
964                 CLIP_RESULT(result)
965                 pkres  |= (result << 8);
966                 /* third pixel */
967                 r1 = *p_ref++;
968                 result = (r2 + r1);
969                 r2 = (r3 + r0);
970                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
971                 r2 = (r4 + r5);
972                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
973                 result = (result + 512) >> 10;
974                 CLIP_RESULT(result)
975                 pkres  |= (result << 16);
976                 /* fourth pixel */
977                 r2 = *p_ref++;
978                 result = (r3 + r2);
979                 r3 = (r4 + r1);
980                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
981                 r3 = (r5 + r0);
982                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
983                 result = (result + 512) >> 10;
984                 CLIP_RESULT(result)
985                 pkres  |= (result << 24);
986                 *p_cur++ = pkres; /* write 4 pixels */
987                 p_ref -= 3;  /* offset back to the middle of filter */
988             }
989             p_cur += curr_offset;  /* move to the next line */
990             p_ref += ref_offset;    /* move to the next line */
991         }
992     }
993 
994     return ;
995 }
996 
HorzInterp3MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)997 void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
998                    int blkwidth, int blkheight)
999 {
1000     uint8 *p_ref;
1001     int   *p_cur;
1002     uint32 tmp;
1003     int result, curr_offset, ref_offset;
1004     int j, r0, r1, r2, r3, r4, r5;
1005 
1006     p_cur = out;
1007     curr_offset = (outpitch - blkwidth);
1008     p_ref = in;
1009     ref_offset = inpitch - blkwidth;
1010 
1011     for (j = blkheight; j > 0 ; j--)
1012     {
1013         tmp = (uint32)(p_ref + blkwidth);
1014         for (; (uint32)p_ref < tmp;)
1015         {
1016 
1017             r0 = p_ref[-2];
1018             r1 = p_ref[-1];
1019             r2 = *p_ref++;
1020             r3 = *p_ref++;
1021             r4 = *p_ref++;
1022             /* first pixel */
1023             r5 = *p_ref++;
1024             result = (r0 + r5);
1025             r0 = (r1 + r4);
1026             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1027             r0 = (r2 + r3);
1028             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1029             *p_cur++ = result;
1030             /* second pixel */
1031             r0 = *p_ref++;
1032             result = (r1 + r0);
1033             r1 = (r2 + r5);
1034             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1035             r1 = (r3 + r4);
1036             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1037             *p_cur++ = result;
1038             /* third pixel */
1039             r1 = *p_ref++;
1040             result = (r2 + r1);
1041             r2 = (r3 + r0);
1042             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1043             r2 = (r4 + r5);
1044             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1045             *p_cur++ = result;
1046             /* fourth pixel */
1047             r2 = *p_ref++;
1048             result = (r3 + r2);
1049             r3 = (r4 + r1);
1050             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1051             r3 = (r5 + r0);
1052             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1053             *p_cur++ = result;
1054             p_ref -= 3; /* move back to the middle of the filter */
1055         }
1056         p_cur += curr_offset; /* move to the next line */
1057         p_ref += ref_offset;
1058     }
1059 
1060     return ;
1061 }
VertInterp1MC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1062 void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1063                    int blkwidth, int blkheight, int dy)
1064 {
1065     uint8 *p_cur, *p_ref;
1066     uint32 tmp;
1067     int result, curr_offset, ref_offset;
1068     int j, i;
1069     int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
1070     uint8  tmp_in[24][24];
1071 
1072     /* not word-aligned */
1073     if (((uint32)in)&0x3)
1074     {
1075         CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1076         in = &tmp_in[2][0];
1077         inpitch = 24;
1078     }
1079     p_cur = out;
1080     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1081     ref_offset = blkheight * inpitch; /* for limit */
1082 
1083     curr_offset += 3;
1084 
1085     if (dy&1)
1086     {
1087         dy = (dy >> 1) ? 0 : -inpitch;
1088 
1089         for (j = 0; j < blkwidth; j += 4, in += 4)
1090         {
1091             r13 = 0;
1092             p_ref = in;
1093             p_cur -= outpitch;  /* compensate for the first offset */
1094             tmp = (uint32)(p_ref + ref_offset); /* limit */
1095             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1096             {
1097                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1098                 p_ref += inpitch;
1099                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1100                 r0 &= 0xFF00FF;
1101 
1102                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1103                 r7 = (r1 >> 8) & 0xFF00FF;
1104                 r1 &= 0xFF00FF;
1105 
1106                 r0 += r1;
1107                 r6 += r7;
1108 
1109                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1110                 r8 = (r2 >> 8) & 0xFF00FF;
1111                 r2 &= 0xFF00FF;
1112 
1113                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1114                 r7 = (r1 >> 8) & 0xFF00FF;
1115                 r1 &= 0xFF00FF;
1116                 r1 += r2;
1117 
1118                 r7 += r8;
1119 
1120                 r0 += 20 * r1;
1121                 r6 += 20 * r7;
1122                 r0 += 0x100010;
1123                 r6 += 0x100010;
1124 
1125                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1126                 r8 = (r2 >> 8) & 0xFF00FF;
1127                 r2 &= 0xFF00FF;
1128 
1129                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1130                 r7 = (r1 >> 8) & 0xFF00FF;
1131                 r1 &= 0xFF00FF;
1132                 r1 += r2;
1133 
1134                 r7 += r8;
1135 
1136                 r0 -= 5 * r1;
1137                 r6 -= 5 * r7;
1138 
1139                 r0 >>= 5;
1140                 r6 >>= 5;
1141                 /* clip */
1142                 r13 |= r6;
1143                 r13 |= r0;
1144                 //CLIPPACK(r6,result)
1145 
1146                 r1 = *((uint32*)(p_ref + dy));
1147                 r2 = (r1 >> 8) & 0xFF00FF;
1148                 r1 &= 0xFF00FF;
1149                 r0 += r1;
1150                 r6 += r2;
1151                 r0 += 0x10001;
1152                 r6 += 0x10001;
1153                 r0 = (r0 >> 1) & 0xFF00FF;
1154                 r6 = (r6 >> 1) & 0xFF00FF;
1155 
1156                 r0 |= (r6 << 8);  /* pack it back */
1157                 *((uint32*)(p_cur += outpitch)) = r0;
1158             }
1159             p_cur += curr_offset; /* offset to the next pixel */
1160             if (r13 & 0xFF000700) /* this column need clipping */
1161             {
1162                 p_cur -= 4;
1163                 for (i = 0; i < 4; i++)
1164                 {
1165                     p_ref = in + i;
1166                     p_cur -= outpitch;  /* compensate for the first offset */
1167 
1168                     tmp = (uint32)(p_ref + ref_offset); /* limit */
1169                     while ((uint32)p_ref < tmp)
1170                     {                           /* loop un-rolled */
1171                         r0 = *(p_ref - (inpitch << 1));
1172                         r1 = *(p_ref - inpitch);
1173                         r2 = *p_ref;
1174                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1175                         r4 = *(p_ref += inpitch);
1176                         /* first pixel */
1177                         r5 = *(p_ref += inpitch);
1178                         result = (r0 + r5);
1179                         r0 = (r1 + r4);
1180                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1181                         r0 = (r2 + r3);
1182                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1183                         result = (result + 16) >> 5;
1184                         CLIP_RESULT(result)
1185                         /* 3/4 pel,  no need to clip */
1186                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
1187                         result = (result >> 1);
1188                         *(p_cur += outpitch) = result;
1189                         /* second pixel */
1190                         r0 = *(p_ref += inpitch);
1191                         result = (r1 + r0);
1192                         r1 = (r2 + r5);
1193                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1194                         r1 = (r3 + r4);
1195                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1196                         result = (result + 16) >> 5;
1197                         CLIP_RESULT(result)
1198                         /* 3/4 pel,  no need to clip */
1199                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
1200                         result = (result >> 1);
1201                         *(p_cur += outpitch) = result;
1202                         /* third pixel */
1203                         r1 = *(p_ref += inpitch);
1204                         result = (r2 + r1);
1205                         r2 = (r3 + r0);
1206                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1207                         r2 = (r4 + r5);
1208                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1209                         result = (result + 16) >> 5;
1210                         CLIP_RESULT(result)
1211                         /* 3/4 pel,  no need to clip */
1212                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
1213                         result = (result >> 1);
1214                         *(p_cur += outpitch) = result;
1215                         /* fourth pixel */
1216                         r2 = *(p_ref += inpitch);
1217                         result = (r3 + r2);
1218                         r3 = (r4 + r1);
1219                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1220                         r3 = (r5 + r0);
1221                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1222                         result = (result + 16) >> 5;
1223                         CLIP_RESULT(result)
1224                         /* 3/4 pel,  no need to clip */
1225                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
1226                         result = (result >> 1);
1227                         *(p_cur += outpitch) = result;
1228                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1229                     }
1230                     p_cur += (curr_offset - 3);
1231                 }
1232             }
1233         }
1234     }
1235     else
1236     {
1237         for (j = 0; j < blkwidth; j += 4, in += 4)
1238         {
1239             r13 = 0;
1240             p_ref = in;
1241             p_cur -= outpitch;  /* compensate for the first offset */
1242             tmp = (uint32)(p_ref + ref_offset); /* limit */
1243             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1244             {
1245                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1246                 p_ref += inpitch;
1247                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1248                 r0 &= 0xFF00FF;
1249 
1250                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1251                 r7 = (r1 >> 8) & 0xFF00FF;
1252                 r1 &= 0xFF00FF;
1253 
1254                 r0 += r1;
1255                 r6 += r7;
1256 
1257                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1258                 r8 = (r2 >> 8) & 0xFF00FF;
1259                 r2 &= 0xFF00FF;
1260 
1261                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1262                 r7 = (r1 >> 8) & 0xFF00FF;
1263                 r1 &= 0xFF00FF;
1264                 r1 += r2;
1265 
1266                 r7 += r8;
1267 
1268                 r0 += 20 * r1;
1269                 r6 += 20 * r7;
1270                 r0 += 0x100010;
1271                 r6 += 0x100010;
1272 
1273                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1274                 r8 = (r2 >> 8) & 0xFF00FF;
1275                 r2 &= 0xFF00FF;
1276 
1277                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1278                 r7 = (r1 >> 8) & 0xFF00FF;
1279                 r1 &= 0xFF00FF;
1280                 r1 += r2;
1281 
1282                 r7 += r8;
1283 
1284                 r0 -= 5 * r1;
1285                 r6 -= 5 * r7;
1286 
1287                 r0 >>= 5;
1288                 r6 >>= 5;
1289                 /* clip */
1290                 r13 |= r6;
1291                 r13 |= r0;
1292                 //CLIPPACK(r6,result)
1293                 r0 &= 0xFF00FF;
1294                 r6 &= 0xFF00FF;
1295                 r0 |= (r6 << 8);  /* pack it back */
1296                 *((uint32*)(p_cur += outpitch)) = r0;
1297             }
1298             p_cur += curr_offset; /* offset to the next pixel */
1299             if (r13 & 0xFF000700) /* this column need clipping */
1300             {
1301                 p_cur -= 4;
1302                 for (i = 0; i < 4; i++)
1303                 {
1304                     p_ref = in + i;
1305                     p_cur -= outpitch;  /* compensate for the first offset */
1306                     tmp = (uint32)(p_ref + ref_offset); /* limit */
1307                     while ((uint32)p_ref < tmp)
1308                     {                           /* loop un-rolled */
1309                         r0 = *(p_ref - (inpitch << 1));
1310                         r1 = *(p_ref - inpitch);
1311                         r2 = *p_ref;
1312                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1313                         r4 = *(p_ref += inpitch);
1314                         /* first pixel */
1315                         r5 = *(p_ref += inpitch);
1316                         result = (r0 + r5);
1317                         r0 = (r1 + r4);
1318                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1319                         r0 = (r2 + r3);
1320                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1321                         result = (result + 16) >> 5;
1322                         CLIP_RESULT(result)
1323                         *(p_cur += outpitch) = result;
1324                         /* second pixel */
1325                         r0 = *(p_ref += inpitch);
1326                         result = (r1 + r0);
1327                         r1 = (r2 + r5);
1328                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1329                         r1 = (r3 + r4);
1330                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1331                         result = (result + 16) >> 5;
1332                         CLIP_RESULT(result)
1333                         *(p_cur += outpitch) = result;
1334                         /* third pixel */
1335                         r1 = *(p_ref += inpitch);
1336                         result = (r2 + r1);
1337                         r2 = (r3 + r0);
1338                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1339                         r2 = (r4 + r5);
1340                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1341                         result = (result + 16) >> 5;
1342                         CLIP_RESULT(result)
1343                         *(p_cur += outpitch) = result;
1344                         /* fourth pixel */
1345                         r2 = *(p_ref += inpitch);
1346                         result = (r3 + r2);
1347                         r3 = (r4 + r1);
1348                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1349                         r3 = (r5 + r0);
1350                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1351                         result = (result + 16) >> 5;
1352                         CLIP_RESULT(result)
1353                         *(p_cur += outpitch) = result;
1354                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1355                     }
1356                     p_cur += (curr_offset - 3);
1357                 }
1358             }
1359         }
1360     }
1361 
1362     return ;
1363 }
1364 
VertInterp2MC(uint8 * in,int inpitch,int * out,int outpitch,int blkwidth,int blkheight)1365 void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1366                    int blkwidth, int blkheight)
1367 {
1368     int *p_cur;
1369     uint8 *p_ref;
1370     uint32 tmp;
1371     int result, curr_offset, ref_offset;
1372     int j, r0, r1, r2, r3, r4, r5;
1373 
1374     p_cur = out;
1375     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1376     ref_offset = blkheight * inpitch; /* for limit */
1377 
1378     for (j = 0; j < blkwidth; j++)
1379     {
1380         p_cur -= outpitch; /* compensate for the first offset */
1381         p_ref = in++;
1382 
1383         tmp = (uint32)(p_ref + ref_offset); /* limit */
1384         while ((uint32)p_ref < tmp)
1385         {                           /* loop un-rolled */
1386             r0 = *(p_ref - (inpitch << 1));
1387             r1 = *(p_ref - inpitch);
1388             r2 = *p_ref;
1389             r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1390             r4 = *(p_ref += inpitch);
1391             /* first pixel */
1392             r5 = *(p_ref += inpitch);
1393             result = (r0 + r5);
1394             r0 = (r1 + r4);
1395             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1396             r0 = (r2 + r3);
1397             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1398             *(p_cur += outpitch) = result;
1399             /* second pixel */
1400             r0 = *(p_ref += inpitch);
1401             result = (r1 + r0);
1402             r1 = (r2 + r5);
1403             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1404             r1 = (r3 + r4);
1405             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1406             *(p_cur += outpitch) = result;
1407             /* third pixel */
1408             r1 = *(p_ref += inpitch);
1409             result = (r2 + r1);
1410             r2 = (r3 + r0);
1411             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1412             r2 = (r4 + r5);
1413             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1414             *(p_cur += outpitch) = result;
1415             /* fourth pixel */
1416             r2 = *(p_ref += inpitch);
1417             result = (r3 + r2);
1418             r3 = (r4 + r1);
1419             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1420             r3 = (r5 + r0);
1421             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1422             *(p_cur += outpitch) = result;
1423             p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1424         }
1425         p_cur += curr_offset;
1426     }
1427 
1428     return ;
1429 }
1430 
VertInterp3MC(int * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight,int dy)1431 void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1432                    int blkwidth, int blkheight, int dy)
1433 {
1434     uint8 *p_cur;
1435     int *p_ref;
1436     uint32 tmp;
1437     int result, result2, curr_offset, ref_offset;
1438     int j, r0, r1, r2, r3, r4, r5;
1439 
1440     p_cur = out;
1441     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1442     ref_offset = blkheight * inpitch; /* for limit */
1443 
1444     if (dy&1)
1445     {
1446         dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1447 
1448         for (j = 0; j < blkwidth; j++)
1449         {
1450             p_cur -= outpitch; /* compensate for the first offset */
1451             p_ref = in++;
1452 
1453             tmp = (uint32)(p_ref + ref_offset); /* limit */
1454             while ((uint32)p_ref < tmp)
1455             {                           /* loop un-rolled */
1456                 r0 = *(p_ref - (inpitch << 1));
1457                 r1 = *(p_ref - inpitch);
1458                 r2 = *p_ref;
1459                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1460                 r4 = *(p_ref += inpitch);
1461                 /* first pixel */
1462                 r5 = *(p_ref += inpitch);
1463                 result = (r0 + r5);
1464                 r0 = (r1 + r4);
1465                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1466                 r0 = (r2 + r3);
1467                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1468                 result = (result + 512) >> 10;
1469                 CLIP_RESULT(result)
1470                 result2 = ((p_ref[dy] + 16) >> 5);
1471                 CLIP_RESULT(result2)
1472                 /* 3/4 pel,  no need to clip */
1473                 result = (result + result2 + 1);
1474                 result = (result >> 1);
1475                 *(p_cur += outpitch) = result;
1476                 /* second pixel */
1477                 r0 = *(p_ref += inpitch);
1478                 result = (r1 + r0);
1479                 r1 = (r2 + r5);
1480                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1481                 r1 = (r3 + r4);
1482                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1483                 result = (result + 512) >> 10;
1484                 CLIP_RESULT(result)
1485                 result2 = ((p_ref[dy] + 16) >> 5);
1486                 CLIP_RESULT(result2)
1487                 /* 3/4 pel,  no need to clip */
1488                 result = (result + result2 + 1);
1489                 result = (result >> 1);
1490                 *(p_cur += outpitch) = result;
1491                 /* third pixel */
1492                 r1 = *(p_ref += inpitch);
1493                 result = (r2 + r1);
1494                 r2 = (r3 + r0);
1495                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1496                 r2 = (r4 + r5);
1497                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1498                 result = (result + 512) >> 10;
1499                 CLIP_RESULT(result)
1500                 result2 = ((p_ref[dy] + 16) >> 5);
1501                 CLIP_RESULT(result2)
1502                 /* 3/4 pel,  no need to clip */
1503                 result = (result + result2 + 1);
1504                 result = (result >> 1);
1505                 *(p_cur += outpitch) = result;
1506                 /* fourth pixel */
1507                 r2 = *(p_ref += inpitch);
1508                 result = (r3 + r2);
1509                 r3 = (r4 + r1);
1510                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1511                 r3 = (r5 + r0);
1512                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1513                 result = (result + 512) >> 10;
1514                 CLIP_RESULT(result)
1515                 result2 = ((p_ref[dy] + 16) >> 5);
1516                 CLIP_RESULT(result2)
1517                 /* 3/4 pel,  no need to clip */
1518                 result = (result + result2 + 1);
1519                 result = (result >> 1);
1520                 *(p_cur += outpitch) = result;
1521                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1522             }
1523             p_cur += curr_offset;
1524         }
1525     }
1526     else
1527     {
1528         for (j = 0; j < blkwidth; j++)
1529         {
1530             p_cur -= outpitch; /* compensate for the first offset */
1531             p_ref = in++;
1532 
1533             tmp = (uint32)(p_ref + ref_offset); /* limit */
1534             while ((uint32)p_ref < tmp)
1535             {                           /* loop un-rolled */
1536                 r0 = *(p_ref - (inpitch << 1));
1537                 r1 = *(p_ref - inpitch);
1538                 r2 = *p_ref;
1539                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1540                 r4 = *(p_ref += inpitch);
1541                 /* first pixel */
1542                 r5 = *(p_ref += inpitch);
1543                 result = (r0 + r5);
1544                 r0 = (r1 + r4);
1545                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1546                 r0 = (r2 + r3);
1547                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1548                 result = (result + 512) >> 10;
1549                 CLIP_RESULT(result)
1550                 *(p_cur += outpitch) = result;
1551                 /* second pixel */
1552                 r0 = *(p_ref += inpitch);
1553                 result = (r1 + r0);
1554                 r1 = (r2 + r5);
1555                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1556                 r1 = (r3 + r4);
1557                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1558                 result = (result + 512) >> 10;
1559                 CLIP_RESULT(result)
1560                 *(p_cur += outpitch) = result;
1561                 /* third pixel */
1562                 r1 = *(p_ref += inpitch);
1563                 result = (r2 + r1);
1564                 r2 = (r3 + r0);
1565                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1566                 r2 = (r4 + r5);
1567                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1568                 result = (result + 512) >> 10;
1569                 CLIP_RESULT(result)
1570                 *(p_cur += outpitch) = result;
1571                 /* fourth pixel */
1572                 r2 = *(p_ref += inpitch);
1573                 result = (r3 + r2);
1574                 r3 = (r4 + r1);
1575                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1576                 r3 = (r5 + r0);
1577                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1578                 result = (result + 512) >> 10;
1579                 CLIP_RESULT(result)
1580                 *(p_cur += outpitch) = result;
1581                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1582             }
1583             p_cur += curr_offset;
1584         }
1585     }
1586 
1587     return ;
1588 }
1589 
DiagonalInterpMC(uint8 * in1,uint8 * in2,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1590 void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1591                       uint8 *out, int outpitch,
1592                       int blkwidth, int blkheight)
1593 {
1594     int j, i;
1595     int result;
1596     uint8 *p_cur, *p_ref, *p_tmp8;
1597     int curr_offset, ref_offset;
1598     uint8 tmp_res[24][24], tmp_in[24][24];
1599     uint32 *p_tmp;
1600     uint32 tmp, pkres, tmp_result;
1601     int32 r0, r1, r2, r3, r4, r5;
1602     int32 r6, r7, r8, r9, r10, r13;
1603 
1604     ref_offset = inpitch - blkwidth;
1605     p_ref = in1 - 2;
1606     /* perform horizontal interpolation */
1607     /* not word-aligned */
1608     /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1609     /*  if(((uint32)p_ref)&0x3)
1610         {
1611             CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1612             p_ref = &tmp_in[0][0];
1613             ref_offset = 24-blkwidth;
1614         }*/
1615 
1616     p_tmp = (uint32*) & (tmp_res[0][0]);
1617     for (j = blkheight; j > 0; j--)
1618     {
1619         r13 = 0;
1620         tmp = (uint32)(p_ref + blkwidth);
1621 
1622         //r0 = *((uint32*)p_ref);   /* d,c,b,a */
1623         //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
1624         //r0 &= 0xFF00FF;           /* 0,c,0,a */
1625         /* It is faster to read 1 byte at a time,  */
1626         r0 = p_ref[0];
1627         r1 = p_ref[2];
1628         r0 |= (r1 << 16);           /* 0,c,0,a */
1629         r1 = p_ref[1];
1630         r2 = p_ref[3];
1631         r1 |= (r2 << 16);           /* 0,d,0,b */
1632 
1633         while ((uint32)p_ref < tmp)
1634         {
1635             //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1636             //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
1637             //r2 &= 0xFF00FF;           /* 0,g,0,e */
1638             /* It is faster to read 1 byte at a time,  */
1639             r2 = *(p_ref += 4);
1640             r3 = p_ref[2];
1641             r2 |= (r3 << 16);           /* 0,g,0,e */
1642             r3 = p_ref[1];
1643             r4 = p_ref[3];
1644             r3 |= (r4 << 16);           /* 0,h,0,f */
1645 
1646             r4 = r0 + r3;       /* c+h, a+f */
1647             r5 = r0 + r1;   /* c+d, a+b */
1648             r6 = r2 + r3;   /* g+h, e+f */
1649             r5 >>= 16;
1650             r5 |= (r6 << 16);   /* e+f, c+d */
1651             r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
1652             r4 += 0x100010; /* +16, +16 */
1653             r5 = r1 + r2;       /* d+g, b+e */
1654             r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1655             r4 >>= 5;
1656             r13 |= r4;      /* check clipping */
1657             r4 &= 0xFF00FF; /* mask */
1658 
1659             r5 = p_ref[4];  /* i */
1660             r6 = (r5 << 16);
1661             r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1662             r5 += r1;       /* d+i, b+g */ /* r5 not free */
1663             r1 >>= 16;
1664             r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1665             r1 += r2;       /* f+g, d+e */
1666             r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
1667             r0 >>= 16;
1668             r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1669             r0 += r3;       /* e+h, c+f */
1670             r5 += 0x100010; /* 16,16 */
1671             r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1672             r5 >>= 5;
1673             r13 |= r5;      /* check clipping */
1674             r5 &= 0xFF00FF; /* mask */
1675 
1676             r4 |= (r5 << 8);    /* pack them together */
1677             *p_tmp++ = r4;
1678             r1 = r3;
1679             r0 = r2;
1680         }
1681         p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1682         p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1683 
1684         if (r13&0xFF000700) /* need clipping */
1685         {
1686             /* move back to the beginning of the line */
1687             p_ref -= (ref_offset + blkwidth);   /* input */
1688             p_tmp -= 6; /* intermediate output */
1689             tmp = (uint32)(p_ref + blkwidth);
1690             while ((uint32)p_ref < tmp)
1691             {
1692                 r0 = *p_ref++;
1693                 r1 = *p_ref++;
1694                 r2 = *p_ref++;
1695                 r3 = *p_ref++;
1696                 r4 = *p_ref++;
1697                 /* first pixel */
1698                 r5 = *p_ref++;
1699                 result = (r0 + r5);
1700                 r0 = (r1 + r4);
1701                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1702                 r0 = (r2 + r3);
1703                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1704                 result = (result + 16) >> 5;
1705                 CLIP_RESULT(result)
1706                 pkres = result;
1707                 /* second pixel */
1708                 r0 = *p_ref++;
1709                 result = (r1 + r0);
1710                 r1 = (r2 + r5);
1711                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1712                 r1 = (r3 + r4);
1713                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1714                 result = (result + 16) >> 5;
1715                 CLIP_RESULT(result)
1716                 pkres |= (result << 8);
1717                 /* third pixel */
1718                 r1 = *p_ref++;
1719                 result = (r2 + r1);
1720                 r2 = (r3 + r0);
1721                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1722                 r2 = (r4 + r5);
1723                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1724                 result = (result + 16) >> 5;
1725                 CLIP_RESULT(result)
1726                 pkres |= (result << 16);
1727                 /* fourth pixel */
1728                 r2 = *p_ref++;
1729                 result = (r3 + r2);
1730                 r3 = (r4 + r1);
1731                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1732                 r3 = (r5 + r0);
1733                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1734                 result = (result + 16) >> 5;
1735                 CLIP_RESULT(result)
1736                 pkres |= (result << 24);
1737 
1738                 *p_tmp++ = pkres; /* write 4 pixel */
1739                 p_ref -= 5;
1740             }
1741             p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1742             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1743         }
1744     }
1745 
1746     /*  perform vertical interpolation */
1747     /* not word-aligned */
1748     if (((uint32)in2)&0x3)
1749     {
1750         CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1751         in2 = &tmp_in[2][0];
1752         inpitch = 24;
1753     }
1754 
1755     p_cur = out;
1756     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1757     pkres = blkheight * inpitch; /* reuse it for limit */
1758 
1759     curr_offset += 3;
1760 
1761     for (j = 0; j < blkwidth; j += 4, in2 += 4)
1762     {
1763         r13 = 0;
1764         p_ref = in2;
1765         p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1766         p_tmp8 -= 24;  /* compensate for the first offset */
1767         p_cur -= outpitch;  /* compensate for the first offset */
1768         tmp = (uint32)(p_ref + pkres); /* limit */
1769         while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1770         {
1771             /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign,  */
1772             /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
1773             r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
1774             r6 |= (r1<<16);         p_ref+=inpitch; */
1775             r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1776             p_ref += inpitch;
1777             r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1778             r0 &= 0xFF00FF;
1779 
1780             /*p_ref8 = p_ref+(inpitch<<1);
1781             r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
1782             r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
1783             r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1784             r7 = (r1 >> 8) & 0xFF00FF;
1785             r1 &= 0xFF00FF;
1786 
1787             r0 += r1;
1788             r6 += r7;
1789 
1790             /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
1791             r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
1792             r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1793             r8 = (r2 >> 8) & 0xFF00FF;
1794             r2 &= 0xFF00FF;
1795 
1796             /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1797             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1798             r2 = p_ref8[3];         r7 |= (r2<<16);*/
1799             r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1800             r7 = (r1 >> 8) & 0xFF00FF;
1801             r1 &= 0xFF00FF;
1802             r1 += r2;
1803 
1804             r7 += r8;
1805 
1806             r0 += 20 * r1;
1807             r6 += 20 * r7;
1808             r0 += 0x100010;
1809             r6 += 0x100010;
1810 
1811             /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
1812             r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
1813             r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1814             r8 = (r2 >> 8) & 0xFF00FF;
1815             r2 &= 0xFF00FF;
1816 
1817             /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1818             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1819             r2 = p_ref8[3];         r7 |= (r2<<16);*/
1820             r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1821             r7 = (r1 >> 8) & 0xFF00FF;
1822             r1 &= 0xFF00FF;
1823             r1 += r2;
1824 
1825             r7 += r8;
1826 
1827             r0 -= 5 * r1;
1828             r6 -= 5 * r7;
1829 
1830             r0 >>= 5;
1831             r6 >>= 5;
1832             /* clip */
1833             r13 |= r6;
1834             r13 |= r0;
1835             //CLIPPACK(r6,result)
1836             /* add with horizontal results */
1837             r10 = *((uint32*)(p_tmp8 += 24));
1838             r9 = (r10 >> 8) & 0xFF00FF;
1839             r10 &= 0xFF00FF;
1840 
1841             r0 += r10;
1842             r0 += 0x10001;
1843             r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1844 
1845             r6 += r9;
1846             r6 += 0x10001;
1847             r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1848 
1849             r0 |= (r6 << 8);  /* pack it back */
1850             *((uint32*)(p_cur += outpitch)) = r0;
1851         }
1852         p_cur += curr_offset; /* offset to the next pixel */
1853         if (r13 & 0xFF000700) /* this column need clipping */
1854         {
1855             p_cur -= 4;
1856             for (i = 0; i < 4; i++)
1857             {
1858                 p_ref = in2 + i;
1859                 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1860                 p_tmp8 -= 24;  /* compensate for the first offset */
1861                 p_cur -= outpitch;  /* compensate for the first offset */
1862                 tmp = (uint32)(p_ref + pkres); /* limit */
1863                 while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1864                 {
1865                     r0 = *(p_ref - (inpitch << 1));
1866                     r1 = *(p_ref - inpitch);
1867                     r2 = *p_ref;
1868                     r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1869                     r4 = *(p_ref += inpitch);
1870                     /* first pixel */
1871                     r5 = *(p_ref += inpitch);
1872                     result = (r0 + r5);
1873                     r0 = (r1 + r4);
1874                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1875                     r0 = (r2 + r3);
1876                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1877                     result = (result + 16) >> 5;
1878                     CLIP_RESULT(result)
1879                     tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
1880                     result = (result + tmp_result + 1);  /* no clip */
1881                     result = (result >> 1);
1882                     *(p_cur += outpitch) = result;
1883                     /* second pixel */
1884                     r0 = *(p_ref += inpitch);
1885                     result = (r1 + r0);
1886                     r1 = (r2 + r5);
1887                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1888                     r1 = (r3 + r4);
1889                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1890                     result = (result + 16) >> 5;
1891                     CLIP_RESULT(result)
1892                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1893                     result = (result + tmp_result + 1);  /* no clip */
1894                     result = (result >> 1);
1895                     *(p_cur += outpitch) = result;
1896                     /* third pixel */
1897                     r1 = *(p_ref += inpitch);
1898                     result = (r2 + r1);
1899                     r2 = (r3 + r0);
1900                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1901                     r2 = (r4 + r5);
1902                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1903                     result = (result + 16) >> 5;
1904                     CLIP_RESULT(result)
1905                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1906                     result = (result + tmp_result + 1);  /* no clip */
1907                     result = (result >> 1);
1908                     *(p_cur += outpitch) = result;
1909                     /* fourth pixel */
1910                     r2 = *(p_ref += inpitch);
1911                     result = (r3 + r2);
1912                     r3 = (r4 + r1);
1913                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1914                     r3 = (r5 + r0);
1915                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1916                     result = (result + 16) >> 5;
1917                     CLIP_RESULT(result)
1918                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1919                     result = (result + tmp_result + 1);  /* no clip */
1920                     result = (result >> 1);
1921                     *(p_cur += outpitch) = result;
1922                     p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1923                 }
1924                 p_cur += (curr_offset - 3);
1925             }
1926         }
1927     }
1928 
1929     return ;
1930 }
1931 
1932 /* position G */
FullPelMC(uint8 * in,int inpitch,uint8 * out,int outpitch,int blkwidth,int blkheight)1933 void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1934                int blkwidth, int blkheight)
1935 {
1936     int i, j;
1937     int offset_in = inpitch - blkwidth;
1938     int offset_out = outpitch - blkwidth;
1939     uint32 temp;
1940     uint8 byte;
1941 
1942     if (((uint32)in)&3)
1943     {
1944         for (j = blkheight; j > 0; j--)
1945         {
1946             for (i = blkwidth; i > 0; i -= 4)
1947             {
1948                 temp = *in++;
1949                 byte = *in++;
1950                 temp |= (byte << 8);
1951                 byte = *in++;
1952                 temp |= (byte << 16);
1953                 byte = *in++;
1954                 temp |= (byte << 24);
1955 
1956                 *((uint32*)out) = temp; /* write 4 bytes */
1957                 out += 4;
1958             }
1959             out += offset_out;
1960             in += offset_in;
1961         }
1962     }
1963     else
1964     {
1965         for (j = blkheight; j > 0; j--)
1966         {
1967             for (i = blkwidth; i > 0; i -= 4)
1968             {
1969                 temp = *((uint32*)in);
1970                 *((uint32*)out) = temp;
1971                 in += 4;
1972                 out += 4;
1973             }
1974             out += offset_out;
1975             in += offset_in;
1976         }
1977     }
1978     return ;
1979 }
1980 
ChromaMotionComp(uint8 * ref,int picwidth,int picheight,int x_pos,int y_pos,uint8 * pred,int pred_pitch,int blkwidth,int blkheight)1981 void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1982                       int x_pos, int y_pos,
1983                       uint8 *pred, int pred_pitch,
1984                       int blkwidth, int blkheight)
1985 {
1986     int dx, dy;
1987     int offset_dx, offset_dy;
1988     int index;
1989     uint8 temp[24][24];
1990 
1991     dx = x_pos & 7;
1992     dy = y_pos & 7;
1993     offset_dx = (dx + 7) >> 3;
1994     offset_dy = (dy + 7) >> 3;
1995     x_pos = x_pos >> 3;  /* round it to full-pel resolution */
1996     y_pos = y_pos >> 3;
1997 
1998     if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight))
1999     {
2000         ref += y_pos * picwidth + x_pos;
2001     }
2002     else
2003     {
2004         CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy);
2005         ref = &temp[0][0];
2006         picwidth = 24;
2007     }
2008 
2009     index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
2010 
2011     (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight);
2012     return ;
2013 }
2014 
2015 
2016 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done)  */
ChromaDiagonalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2017 void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2019 {
2020     int32 r0, r1, r2, r3, result0, result1;
2021     uint8 temp[288];
2022     uint8 *ref, *out;
2023     int i, j;
2024     int dx_8 = 8 - dx;
2025     int dy_8 = 8 - dy;
2026 
2027     /* horizontal first */
2028     out = temp;
2029     for (i = 0; i < blkheight + 1; i++)
2030     {
2031         ref = pRef;
2032         r0 = ref[0];
2033         for (j = 0; j < blkwidth; j += 4)
2034         {
2035             r0 |= (ref[2] << 16);
2036             result0 = dx_8 * r0;
2037 
2038             r1 = ref[1] | (ref[3] << 16);
2039             result0 += dx * r1;
2040             *(int32 *)out = result0;
2041 
2042             result0 = dx_8 * r1;
2043 
2044             r2 = ref[4];
2045             r0 = r0 >> 16;
2046             r1 = r0 | (r2 << 16);
2047             result0 += dx * r1;
2048             *(int32 *)(out + 16) = result0;
2049 
2050             ref += 4;
2051             out += 4;
2052             r0 = r2;
2053         }
2054         pRef += srcPitch;
2055         out += (32 - blkwidth);
2056     }
2057 
2058 //  pRef -= srcPitch*(blkheight+1);
2059     ref = temp;
2060 
2061     for (j = 0; j < blkwidth; j += 4)
2062     {
2063         r0 = *(int32 *)ref;
2064         r1 = *(int32 *)(ref + 16);
2065         ref += 32;
2066         out = pOut;
2067         for (i = 0; i < (blkheight >> 1); i++)
2068         {
2069             result0 = dy_8 * r0 + 0x00200020;
2070             r2 = *(int32 *)ref;
2071             result0 += dy * r2;
2072             result0 >>= 6;
2073             result0 &= 0x00FF00FF;
2074             r0 = r2;
2075 
2076             result1 = dy_8 * r1 + 0x00200020;
2077             r3 = *(int32 *)(ref + 16);
2078             result1 += dy * r3;
2079             result1 >>= 6;
2080             result1 &= 0x00FF00FF;
2081             r1 = r3;
2082             *(int32 *)out = result0 | (result1 << 8);
2083             out += predPitch;
2084             ref += 32;
2085 
2086             result0 = dy_8 * r0 + 0x00200020;
2087             r2 = *(int32 *)ref;
2088             result0 += dy * r2;
2089             result0 >>= 6;
2090             result0 &= 0x00FF00FF;
2091             r0 = r2;
2092 
2093             result1 = dy_8 * r1 + 0x00200020;
2094             r3 = *(int32 *)(ref + 16);
2095             result1 += dy * r3;
2096             result1 >>= 6;
2097             result1 &= 0x00FF00FF;
2098             r1 = r3;
2099             *(int32 *)out = result0 | (result1 << 8);
2100             out += predPitch;
2101             ref += 32;
2102         }
2103         pOut += 4;
2104         ref = temp + 4; /* since it can only iterate twice max  */
2105     }
2106     return;
2107 }
2108 
ChromaHorizontalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2109 void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2110                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2111 {
2112     OSCL_UNUSED_ARG(dy);
2113     int32 r0, r1, r2, result0, result1;
2114     uint8 *ref, *out;
2115     int i, j;
2116     int dx_8 = 8 - dx;
2117 
2118     /* horizontal first */
2119     for (i = 0; i < blkheight; i++)
2120     {
2121         ref = pRef;
2122         out = pOut;
2123 
2124         r0 = ref[0];
2125         for (j = 0; j < blkwidth; j += 4)
2126         {
2127             r0 |= (ref[2] << 16);
2128             result0 = dx_8 * r0 + 0x00040004;
2129 
2130             r1 = ref[1] | (ref[3] << 16);
2131             result0 += dx * r1;
2132             result0 >>= 3;
2133             result0 &= 0x00FF00FF;
2134 
2135             result1 = dx_8 * r1 + 0x00040004;
2136 
2137             r2 = ref[4];
2138             r0 = r0 >> 16;
2139             r1 = r0 | (r2 << 16);
2140             result1 += dx * r1;
2141             result1 >>= 3;
2142             result1 &= 0x00FF00FF;
2143 
2144             *(int32 *)out = result0 | (result1 << 8);
2145 
2146             ref += 4;
2147             out += 4;
2148             r0 = r2;
2149         }
2150 
2151         pRef += srcPitch;
2152         pOut += predPitch;
2153     }
2154     return;
2155 }
2156 
ChromaVerticalMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2157 void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2158                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2159 {
2160     OSCL_UNUSED_ARG(dx);
2161     int32 r0, r1, r2, r3, result0, result1;
2162     int i, j;
2163     uint8 *ref, *out;
2164     int dy_8 = 8 - dy;
2165     /* vertical first */
2166     for (i = 0; i < blkwidth; i += 4)
2167     {
2168         ref = pRef;
2169         out = pOut;
2170 
2171         r0 = ref[0] | (ref[2] << 16);
2172         r1 = ref[1] | (ref[3] << 16);
2173         ref += srcPitch;
2174         for (j = 0; j < blkheight; j++)
2175         {
2176             result0 = dy_8 * r0 + 0x00040004;
2177             r2 = ref[0] | (ref[2] << 16);
2178             result0 += dy * r2;
2179             result0 >>= 3;
2180             result0 &= 0x00FF00FF;
2181             r0 = r2;
2182 
2183             result1 = dy_8 * r1 + 0x00040004;
2184             r3 = ref[1] | (ref[3] << 16);
2185             result1 += dy * r3;
2186             result1 >>= 3;
2187             result1 &= 0x00FF00FF;
2188             r1 = r3;
2189             *(int32 *)out = result0 | (result1 << 8);
2190             ref += srcPitch;
2191             out += predPitch;
2192         }
2193         pOut += 4;
2194         pRef += 4;
2195     }
2196     return;
2197 }
2198 
ChromaDiagonalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2199 void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2200                             uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
2201 {
2202     OSCL_UNUSED_ARG(blkwidth);
2203     int32 r0, r1, temp0, temp1, result;
2204     int32 temp[9];
2205     int32 *out;
2206     int i, r_temp;
2207     int dy_8 = 8 - dy;
2208 
2209     /* horizontal first */
2210     out = temp;
2211     for (i = 0; i < blkheight + 1; i++)
2212     {
2213         r_temp = pRef[1];
2214         temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2215         temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2216         r0 = temp0 | (temp1 << 16);
2217         *out++ = r0;
2218         pRef += srcPitch;
2219     }
2220 
2221     pRef -= srcPitch * (blkheight + 1);
2222 
2223     out = temp;
2224 
2225     r0 = *out++;
2226 
2227     for (i = 0; i < blkheight; i++)
2228     {
2229         result = dy_8 * r0 + 0x00200020;
2230         r1 = *out++;
2231         result += dy * r1;
2232         result >>= 6;
2233         result &= 0x00FF00FF;
2234         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2235         r0 = r1;
2236         pOut += predPitch;
2237     }
2238     return;
2239 }
2240 
ChromaHorizontalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2241 void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2242                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2243 {
2244     OSCL_UNUSED_ARG(dy);
2245     OSCL_UNUSED_ARG(blkwidth);
2246     int i, temp, temp0, temp1;
2247 
2248     /* horizontal first */
2249     for (i = 0; i < blkheight; i++)
2250     {
2251         temp = pRef[1];
2252         temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2253         temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2254 
2255         *(int16 *)pOut = temp0 | (temp1 << 8);
2256         pRef += srcPitch;
2257         pOut += predPitch;
2258 
2259     }
2260     return;
2261 }
ChromaVerticalMC2_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2262 void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2263                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2264 {
2265     OSCL_UNUSED_ARG(dx);
2266     OSCL_UNUSED_ARG(blkwidth);
2267     int32 r0, r1, result;
2268     int i;
2269     int dy_8 = 8 - dy;
2270     r0 = pRef[0] | (pRef[1] << 16);
2271     pRef += srcPitch;
2272     for (i = 0; i < blkheight; i++)
2273     {
2274         result = dy_8 * r0 + 0x00040004;
2275         r1 = pRef[0] | (pRef[1] << 16);
2276         result += dy * r1;
2277         result >>= 3;
2278         result &= 0x00FF00FF;
2279         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2280         r0 = r1;
2281         pRef += srcPitch;
2282         pOut += predPitch;
2283     }
2284     return;
2285 }
2286 
ChromaFullMC_SIMD(uint8 * pRef,int srcPitch,int dx,int dy,uint8 * pOut,int predPitch,int blkwidth,int blkheight)2287 void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2288                        uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2289 {
2290     OSCL_UNUSED_ARG(dx);
2291     OSCL_UNUSED_ARG(dy);
2292     int i, j;
2293     int offset_in = srcPitch - blkwidth;
2294     int offset_out = predPitch - blkwidth;
2295     uint16 temp;
2296     uint8 byte;
2297 
2298     if (((uint32)pRef)&1)
2299     {
2300         for (j = blkheight; j > 0; j--)
2301         {
2302             for (i = blkwidth; i > 0; i -= 2)
2303             {
2304                 temp = *pRef++;
2305                 byte = *pRef++;
2306                 temp |= (byte << 8);
2307                 *((uint16*)pOut) = temp; /* write 2 bytes */
2308                 pOut += 2;
2309             }
2310             pOut += offset_out;
2311             pRef += offset_in;
2312         }
2313     }
2314     else
2315     {
2316         for (j = blkheight; j > 0; j--)
2317         {
2318             for (i = blkwidth; i > 0; i -= 2)
2319             {
2320                 temp = *((uint16*)pRef);
2321                 *((uint16*)pOut) = temp;
2322                 pRef += 2;
2323                 pOut += 2;
2324             }
2325             pOut += offset_out;
2326             pRef += offset_in;
2327         }
2328     }
2329     return ;
2330 }
2331