• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (C) 2007-2008 The Android Open Source Project
2 **
3 ** This software is licensed under the terms of the GNU General Public
4 ** License version 2, as published by the Free Software Foundation, and
5 ** may be copied, distributed, and modified under those terms.
6 **
7 ** This program is distributed in the hope that it will be useful,
8 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
9 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 ** GNU General Public License for more details.
11 */
12 /* this file contains template code and may be included multiple times */
13 
14 #ifndef ARGB_T_DEFINED
15 #define ARGB_T_DEFINED
16 
17 #if USE_MMX
18 #include <mmintrin.h>
19 
/* mmx_t is the raw 64-bit MMX register type; in the MMX build an
 * unpacked ARGB accumulator (argb_t) is simply one such register. */
typedef __m64   mmx_t;
typedef  mmx_t  argb_t;
22 
23 static inline mmx_t
mmx_load8888(unsigned value,mmx_t zero)24 mmx_load8888( unsigned  value, mmx_t  zero )
25 {
26     return _mm_unpacklo_pi8( _mm_cvtsi32_si64 (value), zero);
27 }
28 
29 static inline unsigned
mmx_save8888(mmx_t argb,mmx_t zero)30 mmx_save8888( mmx_t   argb, mmx_t  zero )
31 {
32     return (unsigned) _mm_cvtsi64_si32( _mm_packs_pu16( argb, zero ) );
33 }
34 
35 static inline mmx_t
mmx_expand16(int value)36 mmx_expand16( int  value )
37 {
38     mmx_t  t1 = _mm_cvtsi32_si64( value );
39     return _mm_packs_pi32( t1, t1 );
40 }
41 
42 static inline mmx_t
mmx_mulshift(mmx_t argb,int multiplier,int rshift,mmx_t zero)43 mmx_mulshift( mmx_t   argb, int  multiplier, int  rshift, mmx_t  zero )
44 {
45     mmx_t   ar   = _mm_unpackhi_pi16(argb, zero );
46     mmx_t   gb   = _mm_unpacklo_pi16(argb, zero );
47     mmx_t   mult = mmx_expand16(multiplier);
48 
49     ar = _mm_srli_pi32( _mm_madd_pi16( ar, mult ), rshift );
50     gb = _mm_srli_pi32( _mm_madd_pi16( gb, mult ), rshift );
51 
52     return _mm_packs_pi32( gb, ar );
53 }
54 
55 static inline mmx_t
mmx_interp255(mmx_t m1,mmx_t m2,mmx_t zero,int alpha)56 mmx_interp255( mmx_t  m1, mmx_t  m2, mmx_t  zero, int  alpha )
57 {
58     mmx_t  mult, mult2, t1, t2, r1, r2;
59 
60     // m1 = [ a1 | r1 | g1 | b1 ]
61     // m2 = [ a2 | r2 | g2 | b2 ]
62     alpha = (alpha << 16) | (alpha ^ 255);
63     mult  = _mm_cvtsi32_si64( alpha );                   // mult  = [  0  |  0  |  a  | 1-a ]
64     mult2 = _mm_slli_si64( mult, 32 );                   // mult2 = [  a  | 1-a |  0  |  0  ]
65     mult  = _mm_or_si64( mult, mult2 );                  // mults = [  a  | 1-a |  a  | 1-a ]
66 
67     t1 = _mm_unpackhi_pi16( m1, m2 );    // t1 = [ a2 | a1 | r2 | r1 ]
68     r1 = _mm_madd_pi16( t1, mult );      // r1 = [   ra    |    rr   ]
69 
70     t2 = _mm_unpacklo_pi16( m1, m2 );    // t1 = [ g2 | g1 | b2 | b1 ]
71     r2 = _mm_madd_pi16( t2, mult );      // r2 = [   rg    |    rb   ]
72 
73     r1 = _mm_srli_pi32( r1, 8 );
74     r2 = _mm_srli_pi32( r2, 8 );
75 
76     return  _mm_packs_pi32( r2, r1 );
77 }
78 
/* MMX flavour of the ARGB accumulator macros.
 * A pixel variable is a single mmx_t; most macros below reference a local
 * named _zero, which ARGB_DECL_ZERO() declares and clears.
 */
#define   ARGB_DECL_ZERO()      mmx_t    _zero = _mm_setzero_si64()
#define   ARGB_DECL(x)          mmx_t    x
#define   ARGB_DECL2(x1,x2)     mmx_t    x1, x2
#define   ARGB_ZERO(x)          x = _zero
/* convert between a packed 32-bit value and an unpacked accumulator */
#define   ARGB_UNPACK(x,v)      x =  mmx_load8888((v), _zero)
#define   ARGB_PACK(x)          mmx_save8888(x, _zero)
#define   ARGB_COPY(x,y)        x = y
/* NOTE(review): the sum uses 32-bit lane adds while ARGB_ADDW_31/13 use
 * 16-bit lane adds; they only differ when a 16-bit lane overflows. */
#define   ARGB_SUM(x1,x2,x3)    x1 = _mm_add_pi32(x2, x3)
/* scale x by (red >> 8) / 256; values of red >= 65536 leave x untouched */
#define   ARGB_REDUCE(x,red)   \
    ({ \
        int  _red = (red) >> 8;  \
        if (_red < 256) \
            x = mmx_mulshift( x, _red, 8, _zero ); \
    })

/* x1 = blend of x2 and x3, weight of x3 given by alpha (see mmx_interp255) */
#define  ARGB_INTERP255(x1,x2,x3,alpha)  \
    x1 = mmx_interp255( x2, x3, _zero, (alpha))

/* weighted sums: ADDW_pq computes x1 = p*x2 + q*x3, without normalizing */
#define    ARGB_ADDW_11(x1,x2,x3)  \
    ARGB_SUM(x1,x2,x3)

#define    ARGB_ADDW_31(x1,x2,x3)  \
    ({ \
        mmx_t   _t1 = _mm_add_pi16(x2, x3);  \
        mmx_t   _t2 = _mm_slli_pi16(x2, 1);  \
        x1 = _mm_add_pi16(_t1, _t2);  \
    })

#define    ARGB_ADDW_13(x1,x2,x3)  \
    ({ \
        mmx_t   _t1 = _mm_add_pi16(x2, x3);  \
        mmx_t   _t2 = _mm_slli_pi16(x3, 1);  \
        x1 = _mm_add_pi16(_t1, _t2);  \
    })

/* x1 = x2 with every 16-bit lane shifted right by s bits */
#define    ARGB_SHR(x1,x2,s)   \
    x1 = _mm_srli_pi16(x2, s)


/* x1 = (x2 * v) >> s on every channel */
#define    ARGB_MULSHIFT(x1,x2,v,s)   \
    x1 = mmx_mulshift(x2, v, s, _zero)

/* must be executed once the MMX work is finished */
#define   ARGB_DONE  _mm_empty()

/* final rescaling: s2 holds scale*scale as a fixed-point value with
 * ARGB_RESCALE_SHIFT fractional bits (10 in this flavour) */
#define   ARGB_RESCALE_SHIFT      10
#define   ARGB_DECL_SCALE(s2,s)   int   s2 = (int)((s)*(s)*(1 << ARGB_RESCALE_SHIFT))
#define   ARGB_RESCALE(x,s2)      x = mmx_mulshift( x, s2, ARGB_RESCALE_SHIFT, _zero )
126 
127 #else /* !USE_MMX */
128 
/* Portable flavour: no MMX.  A pixel variable named x is represented by
 * two uint32_t values, x_ag and x_rb, each holding two 8-bit channels in
 * bits 0-7 and 16-23 (mask 0xff00ff) so that both channels can be
 * processed with one integer operation.
 */
typedef uint32_t    argb_t;

#define  ARGB_DECL_ZERO()   /* nothing */
#define  ARGB_DECL(x)       argb_t    x##_ag, x##_rb
#define  ARGB_DECL2(x1,x2)  argb_t    x1##_ag, x1##_rb, x2##_ag, x2##_rb
#define  ARGB_ZERO(x)       (x##_ag = x##_rb = 0)
#define  ARGB_COPY(x,y)     (x##_ag = y##_ag, x##_rb = y##_rb)

/* split the packed value v: odd bytes go to x_ag, even bytes to x_rb */
#define  ARGB_UNPACK(x,v)  \
    ({ \
        argb_t  _v = (argb_t)(v); \
        x##_ag = (_v >> 8) & 0xff00ff; \
        x##_rb = (_v)      & 0xff00ff; \
    })

/* inverse of ARGB_UNPACK; assumes both halves are already masked */
#define  ARGB_PACK(x)      (uint32_t)(((x##_ag) << 8) | x##_rb)

/* plain sum, no masking: results may use more than 8 bits per channel */
#define   ARGB_SUM(x1,x2,x3)  \
    ({ \
        x1##_ag = x2##_ag + x3##_ag; \
        x1##_rb = x2##_rb + x3##_rb; \
    })

/* scale x by (red >> 8) / 256; values of red >= 65536 leave x untouched */
#define   ARGB_REDUCE(x,red)   \
    ({ \
        int  _red = (red) >> 8; \
        if (_red < 256) { \
            x##_ag = ((x##_ag*_red) >> 8) & 0xff00ff; \
            x##_rb = ((x##_rb*_red) >> 8) & 0xff00ff; \
        } \
    })

/* x1 = (x2*(256-alpha) + x3*alpha) >> 8.
 * NOTE(review): for alpha in 0..255 the "_alpha >> 8" term is always 0,
 * so the adjustment line has no effect on such inputs. */
#define    ARGB_INTERP255(x1,x2,x3,alpha)  \
    ({ \
        int  _alpha = (alpha); \
        int  _ialpha; \
        _alpha += _alpha >> 8; \
        _ialpha = 256 - _alpha; \
        x1##_ag = ((x2##_ag*_ialpha + x3##_ag*_alpha) >> 8) & 0xff00ff;  \
        x1##_rb = ((x2##_rb*_ialpha + x3##_rb*_alpha) >> 8) & 0xff00ff;  \
    })

/* weighted sums: ADDW_pq computes x1 = p*x2 + q*x3, without normalizing */
#define    ARGB_ADDW_11(x1,x2,x3)  \
    ({ \
        x1##_ag = (x2##_ag + x3##_ag);  \
        x1##_rb = (x2##_rb + x3##_rb);  \
    })

#define    ARGB_ADDW_31(x1,x2,x3)  \
    ({ \
        x1##_ag = (3*x2##_ag + x3##_ag);  \
        x1##_rb = (3*x2##_rb + x3##_rb);  \
    })

#define    ARGB_ADDW_13(x1,x2,x3)  \
    ({ \
        x1##_ag = (x2##_ag + 3*x3##_ag);  \
        x1##_rb = (x2##_rb + 3*x3##_rb);  \
    })

/* x1 = (x2 * v) >> s on every channel, masked back to 8 bits */
#define    ARGB_MULSHIFT(x1,x2,v,s)   \
    ({ \
        unsigned  _vv = (v);  \
        x1##_ag = ((x2##_ag * _vv) >> (s)) & 0xff00ff;  \
        x1##_rb = ((x2##_rb * _vv) >> (s)) & 0xff00ff;  \
    })

/* x1 = x2 >> s on every channel, masked back to 8 bits */
#define   ARGB_SHR(x1,x2,s)  \
    ({  \
        int  _s = (s);  \
        x1##_ag = (x2##_ag >> _s) & 0xff00ff; \
        x1##_rb = (x2##_rb >> _s) & 0xff00ff; \
    })

/* nothing to clean up in the portable flavour */
#define   ARGB_DONE  ((void)0)

/* final rescaling: s2 holds scale*scale as a fixed-point value with
 * ARGB_RESCALE_SHIFT fractional bits (8 in this flavour) */
#define   ARGB_RESCALE_SHIFT      8
#define   ARGB_DECL_SCALE(s2,s)   int   s2 = (int)((s)*(s)*(1 << ARGB_RESCALE_SHIFT))
#define   ARGB_RESCALE(x,scale2)  ARGB_MULSHIFT(x,x,scale2,ARGB_RESCALE_SHIFT)
208 
209 #endif /* !USE_MMX */
210 
/* helpers shared by both flavours: accumulate in place, and load/store a
 * pixel through a pointer that is accessed as a uint32_t */
#define   ARGB_ADD(x1,x2)     ARGB_SUM(x1,x1,x2)
#define   ARGB_READ(x,p)      ARGB_UNPACK(x,*(uint32_t*)(p))
#define   ARGB_WRITE(x,p)     *(uint32_t*)(p) = ARGB_PACK(x)
214 
215 #endif /* !ARGB_T_DEFINED */
216 
217 
218 
219 #ifdef ARGB_SCALE_GENERIC
220 static void
ARGB_SCALE_GENERIC(ScaleOp * op)221 ARGB_SCALE_GENERIC( ScaleOp*   op )
222 {
223     int        dst_pitch = op->dst_pitch;
224     int        src_pitch = op->src_pitch;
225     uint8_t*   dst_line  = op->dst_line;
226     uint8_t*   src_line  = op->src_line;
227     ARGB_DECL_SCALE(scale2, op->scale);
228     int        h;
229     int        sx = op->sx;
230     int        sy = op->sy;
231     int        ix = op->ix;
232     int        iy = op->iy;
233 
234     _mm_empty();
235 
236     src_line += (sx >> 16)*4 + (sy >> 16)*src_pitch;
237     sx       &= 0xffff;
238     sy       &= 0xffff;
239 
240     for ( h = op->rd.h; h > 0; h-- ) {
241         uint8_t*  dst = dst_line;
242         uint8_t*  src = src_line;
243         uint8_t*  dst_end = dst + 4*op->rd.w;
244         int       sx1 = sx;
245         int       sy1 = sy;
246 
247         for ( ; dst < dst_end; ) {
248             int  sx2 = sx1 + ix;
249             int  sy2 = sy1 + iy;
250 
251             ARGB_DECL_ZERO();
252             ARGB_DECL(spix);
253             ARGB_DECL(pix);
254             ARGB_ZERO(pix);
255 
256             /* the current destination pixel maps to the (sx1,sy1)-(sx2,sy2)
257             * source square, we're going to compute the sum of its pixels'
258             * colors...  simple box filtering
259             */
260             {
261                 int  gsy, gsx;
262                 for ( gsy = 0; gsy < sy2; gsy += 65536 ) {
263                     for ( gsx = 0; gsx < sx2; gsx += 65536 ) {
264                         uint8_t*  s    = src + (gsx >> 16)*4 + (gsy >> 16)*src_pitch;
265                         int       xmin = gsx, xmax = gsx + 65536, ymin = gsy, ymax = gsy + 65536;
266                         unsigned  ww, hh;
267                         unsigned  red;
268 
269                         if (xmin < sx1) xmin = sx1;
270                         if (xmax > sx2) xmax = sx2;
271                         if (ymin < sy1) ymin = sy1;
272                         if (ymax > sy2) ymax = sy2;
273 
274                         ww = (unsigned)(xmax-xmin);
275                         red = ww;
276 
277                         hh = (unsigned)(ymax-ymin);
278                         red = (hh < 65536) ? (red*hh >> 16U) : red;
279 
280                         ARGB_READ(spix,s);
281                         ARGB_REDUCE(spix,red);
282                         ARGB_ADD(pix,spix);
283                     }
284                 }
285             }
286 
287             ARGB_RESCALE(pix,scale2);
288             ARGB_WRITE(pix,dst);
289 
290             sx1  = sx2;
291             src += (sx1 >> 16)*4;
292             sx1 &= 0xffff;
293             dst += 4;
294         }
295 
296         sy       += iy;
297         src_line += (sy >> 16)*src_pitch;
298         sy       &= 0xffff;
299 
300         dst_line += dst_pitch;
301     }
302     ARGB_DONE;
303 }
304 #endif
305 #undef  ARGB_SCALE_GENERIC
306 
307 
308 #ifdef ARGB_SCALE_05_TO_10
/* Multiply two 16.16 fixed-point factors and return the 16.16 product.
 * 1.0 * 1.0 is answered directly because its 32-bit unsigned product
 * would wrap around to zero.
 */
static inline int cross( int  x, int  y )
{
    const int  one = 1 << 16;
    unsigned   product;

    if (x != one || y != one) {
        product = (unsigned)x * (unsigned)y;
        return (int)(product >> 16U);
    }
    return one;
}
315 
316 static void
scale_05_to_10(ScaleOp * op)317 scale_05_to_10( ScaleOp*   op )
318 {
319     int        dst_pitch = op->dst_pitch;
320     int        src_pitch = op->src_pitch;
321     uint8_t*   dst_line  = op->dst_line;
322     uint8_t*   src_line  = op->src_line;
323     ARGB_DECL_SCALE(scale2, op->scale);
324     int        h;
325     int        sx = op->sx;
326     int        sy = op->sy;
327     int        ix = op->ix;
328     int        iy = op->iy;
329 
330     _mm_empty();
331 
332     src_line += (sx >> 16)*4 + (sy >> 16)*src_pitch;
333     sx       &= 0xffff;
334     sy       &= 0xffff;
335 
336     for ( h = op->rd.h; h > 0; h-- ) {
337         uint8_t*  dst = dst_line;
338         uint8_t*  src = src_line;
339         uint8_t*  dst_end = dst + 4*op->rd.w;
340         int       sx1 = sx;
341         int       sy1 = sy;
342 
343         for ( ; dst < dst_end; ) {
344             int  sx2 = sx1 + ix;
345             int  sy2 = sy1 + iy;
346 
347             ARGB_DECL_ZERO();
348             ARGB_DECL2(spix, pix);
349 
350             int      off = src_pitch;
351             int      fx1 = sx1 & 0xffff;
352             int      fx2 = sx2 & 0xffff;
353             int      fy1 = sy1 & 0xffff;
354             int      fy2 = sy2 & 0xffff;
355 
356             int      center_x = ((sx1 >> 16) + 1) < ((sx2-1) >> 16);
357             int      center_y = ((sy1 >> 16) + 1) < ((sy2-1) >> 16);
358 
359             ARGB_ZERO(pix);
360 
361             if (fx2 == 0) {
362                 fx2  = 65536;
363             }
364             if (fy2 == 0) {
365                 fy2  = 65536;
366             }
367             fx1 = 65536 - fx1;
368             fy1 = 65536 - fy1;
369 
370             /** TOP BAND
371              **/
372 
373             /* top-left pixel */
374             ARGB_READ(spix,src);
375             ARGB_REDUCE(spix,cross(fx1,fy1));
376             ARGB_ADD(pix,spix);
377 
378             /* top-center pixel, if any */
379             ARGB_READ(spix,src + 4);
380             if (center_x) {
381                 ARGB_REDUCE(spix,fy1);
382                 ARGB_ADD(pix,spix);
383                 ARGB_READ(spix,src + 8);
384             }
385 
386             /* top-right pixel */
387             ARGB_REDUCE(spix,cross(fx2,fy1));
388             ARGB_ADD(pix,spix);
389 
390             /** MIDDLE BAND, IF ANY
391              **/
392             if (center_y) {
393                 /* left-middle pixel */
394                 ARGB_READ(spix,src + off);
395                 ARGB_REDUCE(spix,fx1);
396                 ARGB_ADD(pix,spix);
397 
398                 /* center pixel, if any */
399                 ARGB_READ(spix,src + off + 4);
400                 if (center_x) {
401                     ARGB_ADD(pix,spix);
402                     ARGB_READ(spix,src + off + 8);
403                 }
404 
405                 /* right-middle pixel */
406                 ARGB_REDUCE(spix,fx2);
407                 ARGB_ADD(pix,spix);
408 
409                 off += src_pitch;
410             }
411 
412             /** BOTTOM BAND
413              **/
414             /* left-bottom pixel */
415             ARGB_READ(spix,src + off);
416             ARGB_REDUCE(spix,cross(fx1,fy2));
417             ARGB_ADD(pix,spix);
418 
419             /* center-bottom, if any */
420             ARGB_READ(spix,src + off + 4);
421             if (center_x) {
422                 ARGB_REDUCE(spix,fy2);
423                 ARGB_ADD(pix,spix);
424                 ARGB_READ(spix,src + off + 8);
425             }
426 
427             /* right-bottom pixel */
428             ARGB_REDUCE(spix,cross(fx2,fy2));
429             ARGB_ADD(pix,spix);
430 
431             /** WRITE IT
432              **/
433             ARGB_RESCALE(pix,scale2);
434             ARGB_WRITE(pix,dst);
435 
436             sx1  = sx2;
437             src += (sx1 >> 16)*4;
438             sx1 &= 0xffff;
439             dst += 4;
440         }
441 
442         sy       += iy;
443         src_line += (sy >> 16)*src_pitch;
444         sy       &= 0xffff;
445 
446         dst_line += dst_pitch;
447     }
448     ARGB_DONE;
449 }
450 #endif
451 #undef ARGB_SCALE_05_TO_10
452 
453 
454 #ifdef ARGB_SCALE_UP_BILINEAR
455 static void
scale_up_bilinear(ScaleOp * op)456 scale_up_bilinear( ScaleOp*  op )
457 {
458     int        dst_pitch = op->dst_pitch;
459     int        src_pitch = op->src_pitch;
460     uint8_t*   dst_line  = op->dst_line;
461     uint8_t*   src_line  = op->src_line;
462     int        sx = op->sx;
463     int        sy = op->sy;
464     int        ix = op->ix;
465     int        iy = op->iy;
466     int        xlimit, ylimit;
467     int        h, sx0;
468 
469     _mm_empty();
470 
471     /* the center pixel is at (sx+ix/2, sy+iy/2), we then want to get */
472     /* the four nearest source pixels, which are at (0.5,0.5) offsets */
473 
474     sx = sx + ix/2 - 32768;
475     sy = sy + iy/2 - 32768;
476 
477     xlimit = (op->src_w-1);
478     ylimit = (op->src_h-1);
479 
480     sx0 = sx;
481 
482     for ( h = op->rd.h; h > 0; h-- ) {
483         uint8_t*  dst = dst_line;
484         uint8_t*  dst_end = dst + 4*op->rd.w;
485 
486         sx = sx0;
487         for ( ; dst < dst_end; ) {
488             int        ex1, ex2, ey1, ey2, alpha;
489             uint8_t*   s;
490 
491             ARGB_DECL_ZERO();
492             ARGB_DECL2(spix1,spix2);
493             ARGB_DECL2(pix3,pix4);
494             ARGB_DECL(pix);
495 
496             /* find the four neighbours */
497             ex1 = (sx >> 16);
498             ey1 = (sy >> 16);
499             ex2 = (sx+65535) >> 16;
500             ey2 = (sy+65535) >> 16;
501 
502             if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
503             if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
504             if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
505             if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;
506 
507             ex2 = (ex2-ex1)*4;
508             ey2 = (ey2-ey1)*src_pitch;
509 
510             /* interpolate */
511             s   = src_line + ex1*4 + ey1*src_pitch;
512             ARGB_READ(spix1, s);
513             ARGB_READ(spix2, s+ex2);
514 
515             alpha  = (sx >> 8) & 0xff;
516             ARGB_INTERP255(pix3,spix1,spix2,alpha);
517 
518             s  += ey2;
519             ARGB_READ(spix1, s);
520             ARGB_READ(spix2, s+ex2);
521 
522             ARGB_INTERP255(pix4,spix1,spix2,alpha);
523 
524             alpha = (sy >> 8) & 0xff;
525             ARGB_INTERP255(pix,pix3,pix4,alpha);
526 
527             ARGB_WRITE(pix,dst);
528 
529             sx  += ix;
530             dst += 4;
531         }
532 
533         sy       += iy;
534         dst_line += dst_pitch;
535     }
536     ARGB_DONE;
537 }
538 #endif
539 #undef ARGB_SCALE_UP_BILINEAR
540 
541 #ifdef ARGB_SCALE_UP_QUICK_4x4
542 static void
ARGB_SCALE_UP_QUICK_4x4(ScaleOp * op)543 ARGB_SCALE_UP_QUICK_4x4( ScaleOp*  op )
544 {
545     int        dst_pitch = op->dst_pitch;
546     int        src_pitch = op->src_pitch;
547     uint8_t*   dst_line  = op->dst_line;
548     uint8_t*   src_line  = op->src_line;
549     int        sx = op->sx;
550     int        sy = op->sy;
551     int        ix = op->ix;
552     int        iy = op->iy;
553     int        xlimit, ylimit;
554     int        h, sx0;
555 
556     _mm_empty();
557 
558     /* the center pixel is at (sx+ix/2, sy+iy/2), we then want to get */
559     /* the four nearest source pixels, which are at (0.5,0.5) offsets */
560 
561     sx = sx + ix/2 - 32768;
562     sy = sy + iy/2 - 32768;
563 
564     xlimit = (op->src_w-1);
565     ylimit = (op->src_h-1);
566 
567     sx0 = sx;
568 
569     for ( h = op->rd.h; h > 0; h-- ) {
570         uint8_t*  dst = dst_line;
571         uint8_t*  dst_end = dst + 4*op->rd.w;
572 
573         sx = sx0;
574         for ( ; dst < dst_end; ) {
575             int        ex1, ex2, ey1, ey2;
576             uint8_t*   p;
577             ARGB_DECL_ZERO();
578             ARGB_DECL(pix);
579             ARGB_DECL2(spix1, spix2);
580             ARGB_DECL2(pix3, pix4);
581 
582             /* find the four neighbours */
583             ex1 = (sx >> 16);
584             ey1 = (sy >> 16);
585             ex2 = (sx+65535) >> 16;
586             ey2 = (sy+65535) >> 16;
587 
588             if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
589             if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
590             if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
591             if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;
592 
593             /* interpolate */
594             p   = (src_line + ex1*4 + ey1*src_pitch);
595 
596             ex2 = (ex2-ex1)*4;
597             ey2 = (ey2-ey1)*src_pitch;
598 
599             switch (((sx >> 14) & 3) | ((sy >> 12) & 12)) {
600                 case 0:
601                     *(uint32_t*)dst = *(uint32_t*)p;
602                     break;
603 
604                 /* top-line is easy */
605                 case 1:
606                     ARGB_READ(spix1, p);
607                     ARGB_READ(spix2, p+ex2);
608                     ARGB_ADDW_31(pix,spix1,spix2);
609                     ARGB_SHR(pix,pix,2);
610                     ARGB_WRITE(pix, dst);
611                     break;
612 
613                 case 2:
614                     ARGB_READ(spix1, p);
615                     ARGB_READ(spix2, p+ex2);
616                     ARGB_ADDW_11(pix, spix1, spix2);
617                     ARGB_SHR(pix,pix,1);
618                     ARGB_WRITE(pix, dst);
619                     break;
620 
621                 case 3:
622                     ARGB_READ(spix1, p);
623                     ARGB_READ(spix2, p+ex2);
624                     ARGB_ADDW_13(pix,spix1,spix2);
625                     ARGB_SHR(pix,pix,2);
626                     ARGB_WRITE(pix, dst);
627                     break;
628 
629                 /* second line is harder */
630                 case 4:
631                     ARGB_READ(spix1, p);
632                     ARGB_READ(spix2, p+ey2);
633                     ARGB_ADDW_31(pix,spix1,spix2);
634                     ARGB_SHR(pix,pix,2);
635                     ARGB_WRITE(pix, dst);
636                     break;
637 
638                 case 5:
639                     ARGB_READ(spix1, p);
640                     ARGB_READ(spix2, p+ex2);
641                     ARGB_ADDW_31(pix3,spix1,spix2);
642                     p += ey2;
643                     ARGB_READ(spix1, p);
644                     ARGB_READ(spix2, p+ex2);
645                     ARGB_ADDW_31(pix4,spix1,spix2);
646 
647                     ARGB_ADDW_31(pix,pix3,pix4);
648                     ARGB_SHR(pix,pix,4);
649                     ARGB_WRITE(pix,dst);
650                     break;
651 
652                 case 6:
653                     ARGB_READ(spix1, p);
654                     ARGB_READ(spix2, p+ex2);
655                     ARGB_ADDW_11(pix3,spix1,spix2);
656                     p += ey2;
657                     ARGB_READ(spix1, p);
658                     ARGB_READ(spix2, p+ex2);
659                     ARGB_ADDW_11(pix4,spix1,spix2);
660 
661                     ARGB_ADDW_31(pix,pix3,pix4);
662                     ARGB_SHR(pix,pix,3);
663                     ARGB_WRITE(pix,dst);
664                     break;
665 
666                 case 7:
667                     ARGB_READ(spix1, p);
668                     ARGB_READ(spix2, p+ex2);
669                     ARGB_ADDW_13(pix3,spix1,spix2);
670                     p += ey2;
671                     ARGB_READ(spix1, p);
672                     ARGB_READ(spix2, p+ex2);
673                     ARGB_ADDW_13(pix4,spix1,spix2);
674 
675                     ARGB_ADDW_31(pix,pix3,pix4);
676                     ARGB_SHR(pix,pix,4);
677                     ARGB_WRITE(pix,dst);
678                     break;
679 
680                  /* third line */
681                 case 8:
682                     ARGB_READ(spix1, p);
683                     ARGB_READ(spix2, p+ey2);
684                     ARGB_ADDW_11(pix,spix1,spix2);
685                     ARGB_SHR(pix,pix,1);
686                     ARGB_WRITE(pix, dst);
687                     break;
688 
689                 case 9:
690                     ARGB_READ(spix1, p);
691                     ARGB_READ(spix2, p+ex2);
692                     ARGB_ADDW_31(pix3,spix1,spix2);
693                     p += ey2;
694                     ARGB_READ(spix1, p);
695                     ARGB_READ(spix2, p+ex2);
696                     ARGB_ADDW_31(pix4,spix1,spix2);
697 
698                     ARGB_ADDW_11(pix,pix3,pix4);
699                     ARGB_SHR(pix,pix,3);
700                     ARGB_WRITE(pix,dst);
701                     break;
702 
703                 case 10:
704                     ARGB_READ(spix1, p);
705                     ARGB_READ(spix2, p+ex2);
706                     ARGB_ADDW_11(pix3,spix1,spix2);
707                     p += ey2;
708                     ARGB_READ(spix1, p);
709                     ARGB_READ(spix2, p+ex2);
710                     ARGB_ADDW_11(pix4,spix1,spix2);
711 
712                     ARGB_ADDW_11(pix,pix3,pix4);
713                     ARGB_SHR(pix,pix,2);
714                     ARGB_WRITE(pix,dst);
715                     break;
716 
717                 case 11:
718                     ARGB_READ(spix1, p);
719                     ARGB_READ(spix2, p+ex2);
720                     ARGB_ADDW_13(pix3,spix1,spix2);
721                     p += ey2;
722                     ARGB_READ(spix1, p);
723                     ARGB_READ(spix2, p+ex2);
724                     ARGB_ADDW_13(pix4,spix1,spix2);
725 
726                     ARGB_ADDW_11(pix,pix3,pix4);
727                     ARGB_SHR(pix,pix,3);
728                     ARGB_WRITE(pix,dst);
729                     break;
730 
731                  /* last line */
732                 case 12:
733                     ARGB_READ(spix1, p);
734                     ARGB_READ(spix2, p+ey2);
735                     ARGB_ADDW_13(pix,spix1,spix2);
736                     ARGB_SHR(pix,pix,2);
737                     ARGB_WRITE(pix, dst);
738                     break;
739 
740                 case 13:
741                     ARGB_READ(spix1, p);
742                     ARGB_READ(spix2, p+ex2);
743                     ARGB_ADDW_31(pix3,spix1,spix2);
744                     p += ey2;
745                     ARGB_READ(spix1, p);
746                     ARGB_READ(spix2, p+ex2);
747                     ARGB_ADDW_31(pix4,spix1,spix2);
748 
749                     ARGB_ADDW_13(pix,pix3,pix4);
750                     ARGB_SHR(pix,pix,4);
751                     ARGB_WRITE(pix,dst);
752                     break;
753 
754                 case 14:
755                     ARGB_READ(spix1, p);
756                     ARGB_READ(spix2, p+ex2);
757                     ARGB_ADDW_11(pix3,spix1,spix2);
758                     p += ey2;
759                     ARGB_READ(spix1, p);
760                     ARGB_READ(spix2, p+ex2);
761                     ARGB_ADDW_11(pix4,spix1,spix2);
762 
763                     ARGB_ADDW_13(pix,pix3,pix4);
764                     ARGB_SHR(pix,pix,3);
765                     ARGB_WRITE(pix,dst);
766                     break;
767 
768                 default:
769                     ARGB_READ(spix1, p);
770                     ARGB_READ(spix2, p+ex2);
771                     ARGB_ADDW_13(pix3,spix1,spix2);
772                     p += ey2;
773                     ARGB_READ(spix1, p);
774                     ARGB_READ(spix2, p+ex2);
775                     ARGB_ADDW_13(pix4,spix1,spix2);
776 
777                     ARGB_ADDW_13(pix,pix3,pix4);
778                     ARGB_SHR(pix,pix,4);
779                     ARGB_WRITE(pix,dst);
780             }
781             sx  += ix;
782             dst += 4;
783         }
784 
785         sy       += iy;
786         dst_line += dst_pitch;
787     }
788     ARGB_DONE;
789 }
790 #endif
791 #undef  ARGB_SCALE_UP_QUICK_4x4
792 
793 
794 #ifdef ARGB_SCALE_NEAREST
795 /* this version scales up with nearest neighbours - looks crap */
796 static void
ARGB_SCALE_NEAREST(ScaleOp * op)797 ARGB_SCALE_NEAREST( ScaleOp*  op )
798 {
799     int        dst_pitch = op->dst_pitch;
800     int        src_pitch = op->src_pitch;
801     uint8_t*   dst_line  = op->dst_line;
802     uint8_t*   src_line  = op->src_line;
803     int        sx = op->sx;
804     int        sy = op->sy;
805     int        ix = op->ix;
806     int        iy = op->iy;
807     int        xlimit, ylimit;
808     int        h, sx0;
809 
810     _mm_empty();
811 
812     /* the center pixel is at (sx+ix/2, sy+iy/2), we then want to get */
813     /* the four nearest source pixels, which are at (0.5,0.5) offsets */
814 
815     sx = sx + ix/2 - 32768;
816     sy = sy + iy/2 - 32768;
817 
818     xlimit = (op->src_w-1);
819     ylimit = (op->src_h-1);
820 
821     sx0 = sx;
822 
823     for ( h = op->rd.h; h > 0; h-- ) {
824         uint8_t*  dst = dst_line;
825         uint8_t*  dst_end = dst + 4*op->rd.w;
826 
827         sx = sx0;
828         for ( ; dst < dst_end; ) {
829             int        ex1, ex2, ey1, ey2;
830             unsigned*  p;
831 
832             /* find the top-left neighbour */
833             ex1 = (sx >> 16);
834             ey1 = (sy >> 16);
835             ex2 = ex1+1;
836             ey2 = ey1+1;
837 
838             if (ex1 < 0) ex1 = 0; else if (ex1 > xlimit) ex1 = xlimit;
839             if (ey1 < 0) ey1 = 0; else if (ey1 > ylimit) ey1 = ylimit;
840             if (ex2 < 0) ex2 = 0; else if (ex2 > xlimit) ex2 = xlimit;
841             if (ey2 < 0) ey2 = 0; else if (ey2 > ylimit) ey2 = ylimit;
842 
843             p   = (unsigned*)(src_line + ex1*4 + ey1*src_pitch);
844             if ((sx & 0xffff) >= 32768)
845                 p += (ex2-ex1);
846             if ((sy & 0xffff) >= 32768)
847                 p = (unsigned*)((char*)p + (ey2-ey1)*src_pitch);
848 
849             *(unsigned*)dst = p[0];
850 
851             sx  += ix;
852             dst += 4;
853         }
854 
855         sy       += iy;
856         dst_line += dst_pitch;
857     }
858 }
859 #endif
860 #undef  ARGB_SCALE_NEAREST
861