• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright (C) 2007-2008 The Android Open Source Project
**
** This software is licensed under the terms of the GNU General Public
** License version 2, as published by the Free Software Foundation, and
** may be copied, distributed, and modified under those terms.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
*/
/* this file contains template code and may be included multiple times */
13 
14 #ifndef ARGB_T_DEFINED
15 #define ARGB_T_DEFINED
16 
17 #if USE_MMX
18 #include <mmintrin.h>
19 
/* In the MMX code path a pixel accumulator is one 64-bit MMX register. */
typedef __m64   mmx_t;
typedef  mmx_t  argb_t;
22 
23 static inline mmx_t
mmx_load8888(unsigned value,mmx_t zero)24 mmx_load8888( unsigned  value, mmx_t  zero )
25 {
26     return _mm_unpacklo_pi8( _mm_cvtsi32_si64 (value), zero);
27 }
28 
29 static inline unsigned
mmx_save8888(mmx_t argb,mmx_t zero)30 mmx_save8888( mmx_t   argb, mmx_t  zero )
31 {
32     return (unsigned) _mm_cvtsi64_si32( _mm_packs_pu16( argb, zero ) );
33 }
34 
35 static inline mmx_t
mmx_expand16(int value)36 mmx_expand16( int  value )
37 {
38     mmx_t  t1 = _mm_cvtsi32_si64( value );
39     return _mm_packs_pi32( t1, t1 );
40 }
41 
42 static inline mmx_t
mmx_mulshift(mmx_t argb,int multiplier,int rshift,mmx_t zero)43 mmx_mulshift( mmx_t   argb, int  multiplier, int  rshift, mmx_t  zero )
44 {
45     mmx_t   ar   = _mm_unpackhi_pi16(argb, zero );
46     mmx_t   gb   = _mm_unpacklo_pi16(argb, zero );
47     mmx_t   mult = mmx_expand16(multiplier);
48 
49     ar = _mm_srli_pi32( _mm_madd_pi16( ar, mult ), rshift );
50     gb = _mm_srli_pi32( _mm_madd_pi16( gb, mult ), rshift );
51 
52     return _mm_packs_pi32( gb, ar );
53 }
54 
55 static inline mmx_t
mmx_interp255(mmx_t m1,mmx_t m2,mmx_t zero,int alpha)56 mmx_interp255( mmx_t  m1, mmx_t  m2, mmx_t  zero, int  alpha )
57 {
58     mmx_t  mult, mult2, t1, t2, r1, r2;
59 
60     // m1 = [ a1 | r1 | g1 | b1 ]
61     // m2 = [ a2 | r2 | g2 | b2 ]
62     alpha = (alpha << 16) | (alpha ^ 255);
63     mult  = _mm_cvtsi32_si64( alpha );                   // mult  = [  0  |  0  |  a  | 1-a ]
64     mult2 = _mm_slli_si64( mult, 32 );                   // mult2 = [  a  | 1-a |  0  |  0  ]
65     mult  = _mm_or_si64( mult, mult2 );                  // mults = [  a  | 1-a |  a  | 1-a ]
66 
67     t1 = _mm_unpackhi_pi16( m1, m2 );    // t1 = [ a2 | a1 | r2 | r1 ]
68     r1 = _mm_madd_pi16( t1, mult );      // r1 = [   ra    |    rr   ]
69 
70     t2 = _mm_unpacklo_pi16( m1, m2 );    // t1 = [ g2 | g1 | b2 | b1 ]
71     r2 = _mm_madd_pi16( t2, mult );      // r2 = [   rg    |    rb   ]
72 
73     r1 = _mm_srli_pi32( r1, 8 );
74     r2 = _mm_srli_pi32( r2, 8 );
75 
76     return  _mm_packs_pi32( r2, r1 );
77 }
78 
/* MMX implementation of the ARGB_* pixel-accumulator macros.
 *
 * An accumulator is a single mmx_t holding four 16-bit lanes.  Several
 * macros reference a local register named `_zero`, which must have been
 * declared in the enclosing scope with ARGB_DECL_ZERO().
 */
#define   ARGB_DECL_ZERO()      mmx_t    _zero = _mm_setzero_si64()
#define   ARGB_DECL(x)          mmx_t    x
#define   ARGB_DECL2(x1,x2)     mmx_t    x1, x2
#define   ARGB_ZERO(x)          x = _zero
#define   ARGB_UNPACK(x,v)      x =  mmx_load8888((v), _zero)
#define   ARGB_PACK(x)          mmx_save8888(x, _zero)
#define   ARGB_COPY(x,y)        x = y
#define   ARGB_SUM(x1,x2,x3)    x1 = _mm_add_pi32(x2, x3)

/* scale x by (red >> 8) / 256; left untouched when (red >> 8) >= 256 */
#define   ARGB_REDUCE(x,red)   \
    ({ \
        int  _red = (red) >> 8;  \
        if (_red < 256) \
            x = mmx_mulshift( x, _red, 8, _zero ); \
    })

/* x1 = interpolation between x2 and x3, see mmx_interp255() */
#define  ARGB_INTERP255(x1,x2,x3,alpha)  \
    x1 = mmx_interp255( x2, x3, _zero, (alpha))

/* weighted sums: ADDW_ab computes x1 = a*x2 + b*x3 */
#define    ARGB_ADDW_11(x1,x2,x3)  \
    ARGB_SUM(x1,x2,x3)

#define    ARGB_ADDW_31(x1,x2,x3)  \
    ({ \
        mmx_t   _t1 = _mm_add_pi16(x2, x3);  \
        mmx_t   _t2 = _mm_slli_pi16(x2, 1);  \
        x1 = _mm_add_pi16(_t1, _t2);  \
    })

#define    ARGB_ADDW_13(x1,x2,x3)  \
    ({ \
        mmx_t   _t1 = _mm_add_pi16(x2, x3);  \
        mmx_t   _t2 = _mm_slli_pi16(x3, 1);  \
        x1 = _mm_add_pi16(_t1, _t2);  \
    })

/* x1 = x2 >> s, applied to every 16-bit lane */
#define    ARGB_SHR(x1,x2,s)   \
    x1 = _mm_srli_pi16(x2, s)


/* x1 = (x2 * v) >> s, applied to every lane */
#define    ARGB_MULSHIFT(x1,x2,v,s)   \
    x1 = mmx_mulshift(x2, v, s, _zero)

/* both bracket a scaling routine with _mm_empty() */
#define   ARGB_BEGIN _mm_empty()
#define   ARGB_DONE  _mm_empty()

/* final rescale of an accumulated sum by scale*scale, in fixed point
 * with ARGB_RESCALE_SHIFT fractional bits */
#define   ARGB_RESCALE_SHIFT      10
#define   ARGB_DECL_SCALE(s2,s)   int   s2 = (int)((s)*(s)*(1 << ARGB_RESCALE_SHIFT))
#define   ARGB_RESCALE(x,s2)      x = mmx_mulshift( x, s2, ARGB_RESCALE_SHIFT, _zero )
127 
128 #else /* !USE_MMX */
129 
/* Portable (non-MMX) implementation of the ARGB_* pixel-accumulator macros.
 *
 * An accumulator named x is a pair of 32-bit words, x_ag and x_rb.  Each
 * word holds two channels of the pixel at bits 0-7 and 16-23 (mask
 * 0xff00ff): x_ag carries bytes 1 and 3 of the packed pixel, x_rb carries
 * bytes 0 and 2.  The 8 spare bits above each channel give headroom for
 * the small sums built by the ADDW/SUM macros before they are shifted
 * back down and masked.
 */
typedef uint32_t    argb_t;

#define  ARGB_DECL_ZERO()   /* nothing */
#define  ARGB_DECL(x)       argb_t    x##_ag, x##_rb
#define  ARGB_DECL2(x1,x2)  argb_t    x1##_ag, x1##_rb, x2##_ag, x2##_rb
#define  ARGB_ZERO(x)       (x##_ag = x##_rb = 0)
#define  ARGB_COPY(x,y)     (x##_ag = y##_ag, x##_rb = y##_rb)

/* split packed pixel v into the two half accumulators */
#define  ARGB_UNPACK(x,v)  \
    ({ \
        argb_t  _v = (argb_t)(v); \
        x##_ag = (_v >> 8) & 0xff00ff; \
        x##_rb = (_v)      & 0xff00ff; \
    })

/* recombine the two halves into a packed 32-bit pixel */
#define  ARGB_PACK(x)      (uint32_t)(((x##_ag) << 8) | x##_rb)

#define   ARGB_SUM(x1,x2,x3)  \
    ({ \
        x1##_ag = x2##_ag + x3##_ag; \
        x1##_rb = x2##_rb + x3##_rb; \
    })

/* scale x by (red >> 8) / 256; left untouched when (red >> 8) >= 256 */
#define   ARGB_REDUCE(x,red)   \
    ({ \
        int  _red = (red) >> 8; \
        if (_red < 256) { \
            x##_ag = ((x##_ag*_red) >> 8) & 0xff00ff; \
            x##_rb = ((x##_rb*_red) >> 8) & 0xff00ff; \
        } \
    })

/* x1 = (x2 * (256 - a) + x3 * a) >> 8, with a = alpha + (alpha >> 8) */
#define    ARGB_INTERP255(x1,x2,x3,alpha)  \
    ({ \
        int  _alpha = (alpha); \
        int  _ialpha; \
        _alpha += _alpha >> 8; \
        _ialpha = 256 - _alpha; \
        x1##_ag = ((x2##_ag*_ialpha + x3##_ag*_alpha) >> 8) & 0xff00ff;  \
        x1##_rb = ((x2##_rb*_ialpha + x3##_rb*_alpha) >> 8) & 0xff00ff;  \
    })

/* weighted sums: ADDW_ab computes x1 = a*x2 + b*x3 (no masking here) */
#define    ARGB_ADDW_11(x1,x2,x3)  \
    ({ \
        x1##_ag = (x2##_ag + x3##_ag);  \
        x1##_rb = (x2##_rb + x3##_rb);  \
    })

#define    ARGB_ADDW_31(x1,x2,x3)  \
    ({ \
        x1##_ag = (3*x2##_ag + x3##_ag);  \
        x1##_rb = (3*x2##_rb + x3##_rb);  \
    })

#define    ARGB_ADDW_13(x1,x2,x3)  \
    ({ \
        x1##_ag = (x2##_ag + 3*x3##_ag);  \
        x1##_rb = (x2##_rb + 3*x3##_rb);  \
    })

/* x1 = (x2 * v) >> s per channel, masked back to 8 bits */
#define    ARGB_MULSHIFT(x1,x2,v,s)   \
    ({ \
        unsigned  _vv = (v);  \
        x1##_ag = ((x2##_ag * _vv) >> (s)) & 0xff00ff;  \
        x1##_rb = ((x2##_rb * _vv) >> (s)) & 0xff00ff;  \
    })

/* x1 = x2 >> s per channel, masked back to 8 bits */
#define   ARGB_SHR(x1,x2,s)  \
    ({  \
        int  _s = (s);  \
        x1##_ag = (x2##_ag >> _s) & 0xff00ff; \
        x1##_rb = (x2##_rb >> _s) & 0xff00ff; \
    })

/* no per-routine setup or teardown is needed without MMX */
#define   ARGB_BEGIN ((void)0)
#define   ARGB_DONE  ((void)0)

/* final rescale of an accumulated sum by scale*scale, in fixed point
 * with ARGB_RESCALE_SHIFT fractional bits */
#define   ARGB_RESCALE_SHIFT      8
#define   ARGB_DECL_SCALE(s2,s)   int   s2 = (int)((s)*(s)*(1 << ARGB_RESCALE_SHIFT))
#define   ARGB_RESCALE(x,scale2)  ARGB_MULSHIFT(x,x,scale2,ARGB_RESCALE_SHIFT)
210 
211 #endif /* !USE_MMX */
212 
/* Helpers shared by both implementations:
 *   ARGB_ADD   - accumulate x2 into x1
 *   ARGB_READ  - load the 32-bit pixel at address p into accumulator x
 *   ARGB_WRITE - pack accumulator x and store it as a 32-bit pixel at p
 */
#define   ARGB_ADD(x1,x2)     ARGB_SUM(x1,x1,x2)
#define   ARGB_READ(x,p)      ARGB_UNPACK(x,*(uint32_t*)(p))
#define   ARGB_WRITE(x,p)     *(uint32_t*)(p) = ARGB_PACK(x)
216 
217 #endif /* !ARGB_T_DEFINED */
218 
219 
220 
#ifdef ARGB_SCALE_GENERIC
/* Generic box-filter scaler.
 *
 * The function is only emitted when ARGB_SCALE_GENERIC is defined; the
 * macro's expansion is used as the function name and it is undefined
 * again afterwards.
 *
 * Source coordinates (op->sx, op->sy) and steps (op->ix, op->iy) are
 * handled as 16.16 fixed-point values.  For every destination pixel the
 * source square (sx1,sy1)-(sx2,sy2) is walked one source pixel at a time;
 * each source pixel is weighted by the part of it covered by the square,
 * the weighted colors are summed, and the sum is rescaled by
 * op->scale * op->scale before being written to the destination.
 */
static void
ARGB_SCALE_GENERIC( ScaleOp*   op )
{
    int        dst_pitch = op->dst_pitch;
    int        src_pitch = op->src_pitch;
    uint8_t*   dst_line  = op->dst_line;
    uint8_t*   src_line  = op->src_line;
    ARGB_DECL_SCALE(scale2, op->scale);
    int        h;
    int        sx = op->sx;
    int        sy = op->sy;
    int        ix = op->ix;
    int        iy = op->iy;

    ARGB_BEGIN;

    /* move to the first source pixel, keep only the fractional offsets */
    src_line += (sx >> 16)*4 + (sy >> 16)*src_pitch;
    sx       &= 0xffff;
    sy       &= 0xffff;

    for ( h = op->rd.h; h > 0; h-- ) {
        uint8_t*  dst = dst_line;
        uint8_t*  src = src_line;
        uint8_t*  dst_end = dst + 4*op->rd.w;
        int       sx1 = sx;
        int       sy1 = sy;

        for ( ; dst < dst_end; ) {
            int  sx2 = sx1 + ix;
            int  sy2 = sy1 + iy;

            ARGB_DECL_ZERO();
            ARGB_DECL(spix);
            ARGB_DECL(pix);
            ARGB_ZERO(pix);

            /* the current destination pixel maps to the (sx1,sy1)-(sx2,sy2)
             * source square, we're going to compute the sum of its pixels'
             * colors...  simple box filtering
             */
            {
                int  gsy, gsx;
                for ( gsy = 0; gsy < sy2; gsy += 65536 ) {
                    for ( gsx = 0; gsx < sx2; gsx += 65536 ) {
                        uint8_t*  s    = src + (gsx >> 16)*4 + (gsy >> 16)*src_pitch;
                        int       xmin = gsx, xmax = gsx + 65536, ymin = gsy, ymax = gsy + 65536;
                        unsigned  ww, hh;
                        unsigned  red;

                        /* clip this source pixel's cell against the square */
                        if (xmin < sx1) xmin = sx1;
                        if (xmax > sx2) xmax = sx2;
                        if (ymin < sy1) ymin = sy1;
                        if (ymax > sy2) ymax = sy2;

                        ww = (unsigned)(xmax-xmin);
                        red = ww;

                        /* coverage = ww * hh in 16.16; a full-height cell
                         * (hh == 65536) leaves the factor unchanged */
                        hh = (unsigned)(ymax-ymin);
                        red = (hh < 65536) ? (red*hh >> 16U) : red;

                        ARGB_READ(spix,s);
                        ARGB_REDUCE(spix,red);
                        ARGB_ADD(pix,spix);
                    }
                }
            }

            ARGB_RESCALE(pix,scale2);
            ARGB_WRITE(pix,dst);

            /* advance to the next destination pixel */
            sx1  = sx2;
            src += (sx1 >> 16)*4;
            sx1 &= 0xffff;
            dst += 4;
        }

        /* advance to the next destination line */
        sy       += iy;
        src_line += (sy >> 16)*src_pitch;
        sy       &= 0xffff;

        dst_line += dst_pitch;
    }
    ARGB_DONE;
}
#endif
#undef  ARGB_SCALE_GENERIC
308 
309 
310 #ifdef ARGB_SCALE_05_TO_10
/* Multiply two 16.16 fixed-point factors and return the 16.16 product.
 *
 * The product is computed in 32-bit unsigned arithmetic, so the one input
 * pair that would overflow it, 1.0 * 1.0 (65536 * 65536), is answered
 * directly.
 */
static inline int cross( int  x, int  y ) {
    if (x == 65536 && y == 65536)
        return 65536;

    return (int)((unsigned)x * (unsigned)y >> 16U);
}
317 
318 static void
scale_05_to_10(ScaleOp * op)319 scale_05_to_10( ScaleOp*   op )
320 {
321     int        dst_pitch = op->dst_pitch;
322     int        src_pitch = op->src_pitch;
323     uint8_t*   dst_line  = op->dst_line;
324     uint8_t*   src_line  = op->src_line;
325     ARGB_DECL_SCALE(scale2, op->scale);
326     int        h;
327     int        sx = op->sx;
328     int        sy = op->sy;
329     int        ix = op->ix;
330     int        iy = op->iy;
331 
332     ARGB_BEGIN;
333 
334     src_line += (sx >> 16)*4 + (sy >> 16)*src_pitch;
335     sx       &= 0xffff;
336     sy       &= 0xffff;
337 
338     for ( h = op->rd.h; h > 0; h-- ) {
339         uint8_t*  dst = dst_line;
340         uint8_t*  src = src_line;
341         uint8_t*  dst_end = dst + 4*op->rd.w;
342         int       sx1 = sx;
343         int       sy1 = sy;
344 
345         for ( ; dst < dst_end; ) {
346             int  sx2 = sx1 + ix;
347             int  sy2 = sy1 + iy;
348 
349             ARGB_DECL_ZERO();
350             ARGB_DECL2(spix, pix);
351 
352             int      off = src_pitch;
353             int      fx1 = sx1 & 0xffff;
354             int      fx2 = sx2 & 0xffff;
355             int      fy1 = sy1 & 0xffff;
356             int      fy2 = sy2 & 0xffff;
357 
358             int      center_x = ((sx1 >> 16) + 1) < ((sx2-1) >> 16);
359             int      center_y = ((sy1 >> 16) + 1) < ((sy2-1) >> 16);
360 
361             ARGB_ZERO(pix);
362 
363             if (fx2 == 0) {
364                 fx2  = 65536;
365             }
366             if (fy2 == 0) {
367                 fy2  = 65536;
368             }
369             fx1 = 65536 - fx1;
370             fy1 = 65536 - fy1;
371 
372             /** TOP BAND
373              **/
374 
375             /* top-left pixel */
376             ARGB_READ(spix,src);
377             ARGB_REDUCE(spix,cross(fx1,fy1));
378             ARGB_ADD(pix,spix);
379 
380             /* top-center pixel, if any */
381             ARGB_READ(spix,src + 4);
382             if (center_x) {
383                 ARGB_REDUCE(spix,fy1);
384                 ARGB_ADD(pix,spix);
385                 ARGB_READ(spix,src + 8);
386             }
387 
388             /* top-right pixel */
389             ARGB_REDUCE(spix,cross(fx2,fy1));
390             ARGB_ADD(pix,spix);
391 
392             /** MIDDLE BAND, IF ANY
393              **/
394             if (center_y) {
395                 /* left-middle pixel */
396                 ARGB_READ(spix,src + off);
397                 ARGB_REDUCE(spix,fx1);
398                 ARGB_ADD(pix,spix);
399 
400                 /* center pixel, if any */
401                 ARGB_READ(spix,src + off + 4);
402                 if (center_x) {
403                     ARGB_ADD(pix,spix);
404                     ARGB_READ(spix,src + off + 8);
405                 }
406 
407                 /* right-middle pixel */
408                 ARGB_REDUCE(spix,fx2);
409                 ARGB_ADD(pix,spix);
410 
411                 off += src_pitch;
412             }
413 
414             /** BOTTOM BAND
415              **/
416             /* left-bottom pixel */
417             ARGB_READ(spix,src + off);
418             ARGB_REDUCE(spix,cross(fx1,fy2));
419             ARGB_ADD(pix,spix);
420 
421             /* center-bottom, if any */
422             ARGB_READ(spix,src + off + 4);
423             if (center_x) {
424                 ARGB_REDUCE(spix,fy2);
425                 ARGB_ADD(pix,spix);
426                 ARGB_READ(spix,src + off + 8);
427             }
428 
429             /* right-bottom pixel */
430             ARGB_REDUCE(spix,cross(fx2,fy2));
431             ARGB_ADD(pix,spix);
432 
433             /** WRITE IT
434              **/
435             ARGB_RESCALE(pix,scale2);
436             ARGB_WRITE(pix,dst);
437 
438             sx1  = sx2;
439             src += (sx1 >> 16)*4;
440             sx1 &= 0xffff;
441             dst += 4;
442         }
443 
444         sy       += iy;
445         src_line += (sy >> 16)*src_pitch;
446         sy       &= 0xffff;
447 
448         dst_line += dst_pitch;
449     }
450     ARGB_DONE;
451 }
452 #endif
453 #undef ARGB_SCALE_05_TO_10
454 
455 
#ifdef ARGB_SCALE_UP_BILINEAR
/* Bilinear up-scaler.
 *
 * Only emitted when ARGB_SCALE_UP_BILINEAR is defined (the macro is
 * undefined again afterwards).  Coordinates and steps are 16.16 fixed
 * point.  For each destination pixel the four neighbouring source pixels
 * are located, clamped to [0, src_w-1] x [0, src_h-1], blended
 * horizontally on both rows and then vertically, using bits 8-15 of the
 * fractional position as the 0..255 blend factor.
 */
static void
scale_up_bilinear( ScaleOp*  op )
{
    int        dst_pitch = op->dst_pitch;
    int        src_pitch = op->src_pitch;
    uint8_t*   dst_line  = op->dst_line;
    uint8_t*   src_line  = op->src_line;
    int        sx = op->sx;
    int        sy = op->sy;
    int        ix = op->ix;
    int        iy = op->iy;
    int        xlimit, ylimit;
    int        h, sx0;

    ARGB_BEGIN;

    /* the center pixel is at (sx+ix/2, sy+iy/2), we then want to get */
    /* the four nearest source pixels, which are at (0.5,0.5) offsets */

    sx = sx + ix/2 - 32768;
    sy = sy + iy/2 - 32768;

    xlimit = (op->src_w-1);
    ylimit = (op->src_h-1);

    sx0 = sx;   /* every destination line restarts from this x position */

    for ( h = op->rd.h; h > 0; h-- ) {
        uint8_t*  dst = dst_line;
        uint8_t*  dst_end = dst + 4*op->rd.w;

        sx = sx0;
        for ( ; dst < dst_end; ) {
            int        ex1, ex2, ey1, ey2, alpha;
            uint8_t*   s;

            ARGB_DECL_ZERO();
            ARGB_DECL2(spix1,spix2);
            ARGB_DECL2(pix3,pix4);
            ARGB_DECL(pix);

            /* find the four neighbours */
            ex1 = (sx >> 16);
            ey1 = (sy >> 16);
            ex2 = (sx+65535) >> 16;
            ey2 = (sy+65535) >> 16;

            /* clamp them to the source image */
            if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
            if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
            if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
            if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;

            /* turn the second coordinates into byte offsets from the first */
            ex2 = (ex2-ex1)*4;
            ey2 = (ey2-ey1)*src_pitch;

            /* interpolate: upper row */
            s   = src_line + ex1*4 + ey1*src_pitch;
            ARGB_READ(spix1, s);
            ARGB_READ(spix2, s+ex2);

            alpha  = (sx >> 8) & 0xff;
            ARGB_INTERP255(pix3,spix1,spix2,alpha);

            /* lower row */
            s  += ey2;
            ARGB_READ(spix1, s);
            ARGB_READ(spix2, s+ex2);

            ARGB_INTERP255(pix4,spix1,spix2,alpha);

            /* blend the two rows */
            alpha = (sy >> 8) & 0xff;
            ARGB_INTERP255(pix,pix3,pix4,alpha);

            ARGB_WRITE(pix,dst);

            sx  += ix;
            dst += 4;
        }

        sy       += iy;
        dst_line += dst_pitch;
    }
    ARGB_DONE;
}
#endif
#undef ARGB_SCALE_UP_BILINEAR
542 
#ifdef ARGB_SCALE_UP_QUICK_4x4

/* Helper for the function below: blend the pixel at `p` with the one
 * `off` bytes further using weighting macro ADDW, divide by 2^shift and
 * store the result at `dst`.  Uses the locals p, dst, pix, spix1, spix2
 * of the enclosing scope. */
#define  ARGB_QUICK4_BLEND2(ADDW, off, shift)   \
    do {                                         \
        ARGB_READ(spix1, p);                     \
        ARGB_READ(spix2, p + (off));             \
        ADDW(pix, spix1, spix2);                 \
        ARGB_SHR(pix, pix, shift);               \
        ARGB_WRITE(pix, dst);                    \
    } while (0)

/* Helper for the function below: blend the 2x2 neighbourhood at `p`.
 * Both rows are first blended horizontally with ADDW_H, the two row
 * results are blended vertically with ADDW_V, and the sum is divided by
 * 2^shift and stored at `dst`.  Advances `p` by ey2 bytes; uses the
 * locals p, ex2, ey2, dst, pix, pix3, pix4, spix1, spix2. */
#define  ARGB_QUICK4_BLEND4(ADDW_H, ADDW_V, shift)   \
    do {                                              \
        ARGB_READ(spix1, p);                          \
        ARGB_READ(spix2, p + ex2);                    \
        ADDW_H(pix3, spix1, spix2);                   \
        p += ey2;                                     \
        ARGB_READ(spix1, p);                          \
        ARGB_READ(spix2, p + ex2);                    \
        ADDW_H(pix4, spix1, spix2);                   \
        ADDW_V(pix, pix3, pix4);                      \
        ARGB_SHR(pix, pix, shift);                    \
        ARGB_WRITE(pix, dst);                         \
    } while (0)

/* Quick up-scaler that quantizes the blend position to a 4x4 grid.
 *
 * Only emitted when ARGB_SCALE_UP_QUICK_4x4 is defined; the macro's
 * expansion is used as the function name and it is undefined again
 * afterwards.  Coordinates and steps are 16.16 fixed point.  The top two
 * fractional bits of sx and sy select one of 16 fixed weightings
 * (4:0, 3:1, 2:2 or 1:3 on each axis) of the four neighbouring source
 * pixels, so only additions and shifts are needed.
 */
static void
ARGB_SCALE_UP_QUICK_4x4( ScaleOp*  op )
{
    int        dst_pitch = op->dst_pitch;
    int        src_pitch = op->src_pitch;
    uint8_t*   dst_line  = op->dst_line;
    uint8_t*   src_line  = op->src_line;
    int        sx = op->sx;
    int        sy = op->sy;
    int        ix = op->ix;
    int        iy = op->iy;
    int        xlimit, ylimit;
    int        h, sx0;

    ARGB_BEGIN;

    /* the center pixel is at (sx+ix/2, sy+iy/2), we then want to get */
    /* the four nearest source pixels, which are at (0.5,0.5) offsets */

    sx = sx + ix/2 - 32768;
    sy = sy + iy/2 - 32768;

    xlimit = (op->src_w-1);
    ylimit = (op->src_h-1);

    sx0 = sx;   /* every destination line restarts from this x position */

    for ( h = op->rd.h; h > 0; h-- ) {
        uint8_t*  dst = dst_line;
        uint8_t*  dst_end = dst + 4*op->rd.w;

        sx = sx0;
        for ( ; dst < dst_end; ) {
            int        ex1, ex2, ey1, ey2;
            uint8_t*   p;
            ARGB_DECL_ZERO();
            ARGB_DECL(pix);
            ARGB_DECL2(spix1, spix2);
            ARGB_DECL2(pix3, pix4);

            /* find the four neighbours */
            ex1 = (sx >> 16);
            ey1 = (sy >> 16);
            ex2 = (sx+65535) >> 16;
            ey2 = (sy+65535) >> 16;

            /* clamp them to the source image */
            if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
            if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
            if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
            if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;

            /* interpolate */
            p   = (src_line + ex1*4 + ey1*src_pitch);

            /* byte offsets of the right and lower neighbours */
            ex2 = (ex2-ex1)*4;
            ey2 = (ey2-ey1)*src_pitch;

            /* bits 0-1: horizontal quarter, bits 2-3: vertical quarter */
            switch (((sx >> 14) & 3) | ((sy >> 12) & 12)) {
                case 0:
                    *(uint32_t*)dst = *(uint32_t*)p;
                    break;

                /* top-line is easy: horizontal blend only */
                case 1:  ARGB_QUICK4_BLEND2(ARGB_ADDW_31, ex2, 2);  break;
                case 2:  ARGB_QUICK4_BLEND2(ARGB_ADDW_11, ex2, 1);  break;
                case 3:  ARGB_QUICK4_BLEND2(ARGB_ADDW_13, ex2, 2);  break;

                /* second line is harder: rows weighted 3:1 */
                case 4:  ARGB_QUICK4_BLEND2(ARGB_ADDW_31, ey2, 2);  break;
                case 5:  ARGB_QUICK4_BLEND4(ARGB_ADDW_31, ARGB_ADDW_31, 4);  break;
                case 6:  ARGB_QUICK4_BLEND4(ARGB_ADDW_11, ARGB_ADDW_31, 3);  break;
                case 7:  ARGB_QUICK4_BLEND4(ARGB_ADDW_13, ARGB_ADDW_31, 4);  break;

                /* third line: rows weighted 1:1 */
                case 8:  ARGB_QUICK4_BLEND2(ARGB_ADDW_11, ey2, 1);  break;
                case 9:  ARGB_QUICK4_BLEND4(ARGB_ADDW_31, ARGB_ADDW_11, 3);  break;
                case 10: ARGB_QUICK4_BLEND4(ARGB_ADDW_11, ARGB_ADDW_11, 2);  break;
                case 11: ARGB_QUICK4_BLEND4(ARGB_ADDW_13, ARGB_ADDW_11, 3);  break;

                /* last line: rows weighted 1:3 */
                case 12: ARGB_QUICK4_BLEND2(ARGB_ADDW_13, ey2, 2);  break;
                case 13: ARGB_QUICK4_BLEND4(ARGB_ADDW_31, ARGB_ADDW_13, 4);  break;
                case 14: ARGB_QUICK4_BLEND4(ARGB_ADDW_11, ARGB_ADDW_13, 3);  break;
                default: ARGB_QUICK4_BLEND4(ARGB_ADDW_13, ARGB_ADDW_13, 4);  break;
            }
            sx  += ix;
            dst += 4;
        }

        sy       += iy;
        dst_line += dst_pitch;
    }
    ARGB_DONE;
}

#undef  ARGB_QUICK4_BLEND2
#undef  ARGB_QUICK4_BLEND4
#endif
#undef  ARGB_SCALE_UP_QUICK_4x4
794 
795 
#ifdef ARGB_SCALE_NEAREST
/* this version scales up with nearest neighbours - looks crap
 *
 * Only emitted when ARGB_SCALE_NEAREST is defined; the macro's expansion
 * is used as the function name and it is undefined again afterwards.
 * Coordinates and steps are 16.16 fixed point.  Each destination pixel is
 * a plain copy of one source pixel: the top-left neighbour, moved one
 * pixel right and/or down when the fractional position is >= 0.5, with
 * all coordinates clamped to [0, src_w-1] x [0, src_h-1].
 */
static void
ARGB_SCALE_NEAREST( ScaleOp*  op )
{
    int        dst_pitch = op->dst_pitch;
    int        src_pitch = op->src_pitch;
    uint8_t*   dst_line  = op->dst_line;
    uint8_t*   src_line  = op->src_line;
    int        sx = op->sx;
    int        sy = op->sy;
    int        ix = op->ix;
    int        iy = op->iy;
    int        xlimit, ylimit;
    int        h, sx0;

    ARGB_BEGIN;

    /* the center pixel is at (sx+ix/2, sy+iy/2); shift by (0.5,0.5) so */
    /* that the integer part addresses its top-left source neighbour    */

    sx = sx + ix/2 - 32768;
    sy = sy + iy/2 - 32768;

    xlimit = (op->src_w-1);
    ylimit = (op->src_h-1);

    sx0 = sx;   /* every destination line restarts from this x position */

    for ( h = op->rd.h; h > 0; h-- ) {
        uint8_t*  dst = dst_line;
        uint8_t*  dst_end = dst + 4*op->rd.w;

        sx = sx0;
        for ( ; dst < dst_end; ) {
            int        ex1, ex2, ey1, ey2;
            uint32_t*  p;

            /* find the top-left neighbour */
            ex1 = (sx >> 16);
            ey1 = (sy >> 16);
            ex2 = ex1+1;
            ey2 = ey1+1;

            /* clamp to the source image */
            if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
            if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
            if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
            if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;

            p   = (uint32_t*)(src_line + ex1*4 + ey1*src_pitch);
            /* past the half-way point: take the right / lower pixel */
            if ((sx & 0xffff) >= 32768)
                p += (ex2-ex1);
            if ((sy & 0xffff) >= 32768)
                p = (uint32_t*)((uint8_t*)p + (ey2-ey1)*src_pitch);

            *(uint32_t*)dst = p[0];

            sx  += ix;
            dst += 4;
        }

        sy       += iy;
        dst_line += dst_pitch;
    }
    /* pair with ARGB_BEGIN, like the other scaling routines */
    ARGB_DONE;
}
#endif
#undef  ARGB_SCALE_NEAREST
863