• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*
3  * Copyright 2009 The Android Open Source Project
4  *
5  * Use of this source code is governed by a BSD-style license that can be
6  * found in the LICENSE file.
7  */
8 
9 
10 #include <emmintrin.h>
11 #include "SkBitmapProcState_opts_SSE2.h"
12 #include "SkUtils.h"
13 
S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)14 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
15                                    const uint32_t* xy,
16                                    int count, uint32_t* colors) {
17     SkASSERT(count > 0 && colors != NULL);
18     SkASSERT(s.fDoFilter);
19     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
20     SkASSERT(s.fAlphaScale == 256);
21 
22     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
23     unsigned rb = s.fBitmap->rowBytes();
24     uint32_t XY = *xy++;
25     unsigned y0 = XY >> 14;
26     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
27     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
28     unsigned subY = y0 & 0xF;
29 
30     // ( 0,  0,  0,  0,  0,  0,  0, 16)
31     __m128i sixteen = _mm_cvtsi32_si128(16);
32 
33     // ( 0,  0,  0,  0, 16, 16, 16, 16)
34     sixteen = _mm_shufflelo_epi16(sixteen, 0);
35 
36     // ( 0,  0,  0,  0,  0,  0,  0,  y)
37     __m128i allY = _mm_cvtsi32_si128(subY);
38 
39     // ( 0,  0,  0,  0,  y,  y,  y,  y)
40     allY = _mm_shufflelo_epi16(allY, 0);
41 
42     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
43     __m128i negY = _mm_sub_epi16(sixteen, allY);
44 
45     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
46     allY = _mm_unpacklo_epi64(allY, negY);
47 
48     // (16, 16, 16, 16, 16, 16, 16, 16 )
49     sixteen = _mm_shuffle_epi32(sixteen, 0);
50 
51     // ( 0,  0,  0,  0,  0,  0,  0,  0)
52     __m128i zero = _mm_setzero_si128();
53     do {
54         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
55         unsigned x0 = XX >> 18;
56         unsigned x1 = XX & 0x3FFF;
57 
58         // (0, 0, 0, 0, 0, 0, 0, x)
59         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
60 
61         // (0, 0, 0, 0, x, x, x, x)
62         allX = _mm_shufflelo_epi16(allX, 0);
63 
64         // (x, x, x, x, x, x, x, x)
65         allX = _mm_shuffle_epi32(allX, 0);
66 
67         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
68         __m128i negX = _mm_sub_epi16(sixteen, allX);
69 
70         // Load 4 samples (pixels).
71         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
72         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
73         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
74         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
75 
76         // (0, 0, a00, a10)
77         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
78 
79         // Expand to 16 bits per component.
80         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
81 
82         // ((a00 * (16-y)), (a10 * y)).
83         a00a10 = _mm_mullo_epi16(a00a10, allY);
84 
85         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
86         a00a10 = _mm_mullo_epi16(a00a10, negX);
87 
88         // (0, 0, a01, a10)
89         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
90 
91         // Expand to 16 bits per component.
92         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
93 
94         // (a01 * (16-y)), (a11 * y)
95         a01a11 = _mm_mullo_epi16(a01a11, allY);
96 
97         // (a01 * (16-y) * x), (a11 * y * x)
98         a01a11 = _mm_mullo_epi16(a01a11, allX);
99 
100         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
101         __m128i sum = _mm_add_epi16(a00a10, a01a11);
102 
103         // (DC, a00*w00 + a01*w01)
104         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
105 
106         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
107         sum = _mm_add_epi16(sum, shifted);
108 
109         // Divide each 16 bit component by 256.
110         sum = _mm_srli_epi16(sum, 8);
111 
112         // Pack lower 4 16 bit values of sum into lower 4 bytes.
113         sum = _mm_packus_epi16(sum, zero);
114 
115         // Extract low int and store.
116         *colors++ = _mm_cvtsi128_si32(sum);
117     } while (--count > 0);
118 }
119 
S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)120 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
121                                   const uint32_t* xy,
122                                   int count, uint32_t* colors) {
123     SkASSERT(count > 0 && colors != NULL);
124     SkASSERT(s.fDoFilter);
125     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
126     SkASSERT(s.fAlphaScale < 256);
127 
128     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
129     unsigned rb = s.fBitmap->rowBytes();
130     uint32_t XY = *xy++;
131     unsigned y0 = XY >> 14;
132     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
133     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
134     unsigned subY = y0 & 0xF;
135 
136     // ( 0,  0,  0,  0,  0,  0,  0, 16)
137     __m128i sixteen = _mm_cvtsi32_si128(16);
138 
139     // ( 0,  0,  0,  0, 16, 16, 16, 16)
140     sixteen = _mm_shufflelo_epi16(sixteen, 0);
141 
142     // ( 0,  0,  0,  0,  0,  0,  0,  y)
143     __m128i allY = _mm_cvtsi32_si128(subY);
144 
145     // ( 0,  0,  0,  0,  y,  y,  y,  y)
146     allY = _mm_shufflelo_epi16(allY, 0);
147 
148     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
149     __m128i negY = _mm_sub_epi16(sixteen, allY);
150 
151     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
152     allY = _mm_unpacklo_epi64(allY, negY);
153 
154     // (16, 16, 16, 16, 16, 16, 16, 16 )
155     sixteen = _mm_shuffle_epi32(sixteen, 0);
156 
157     // ( 0,  0,  0,  0,  0,  0,  0,  0)
158     __m128i zero = _mm_setzero_si128();
159 
160     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
161     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
162 
163     do {
164         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
165         unsigned x0 = XX >> 18;
166         unsigned x1 = XX & 0x3FFF;
167 
168         // (0, 0, 0, 0, 0, 0, 0, x)
169         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
170 
171         // (0, 0, 0, 0, x, x, x, x)
172         allX = _mm_shufflelo_epi16(allX, 0);
173 
174         // (x, x, x, x, x, x, x, x)
175         allX = _mm_shuffle_epi32(allX, 0);
176 
177         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
178         __m128i negX = _mm_sub_epi16(sixteen, allX);
179 
180         // Load 4 samples (pixels).
181         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
182         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
183         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
184         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
185 
186         // (0, 0, a00, a10)
187         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
188 
189         // Expand to 16 bits per component.
190         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
191 
192         // ((a00 * (16-y)), (a10 * y)).
193         a00a10 = _mm_mullo_epi16(a00a10, allY);
194 
195         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
196         a00a10 = _mm_mullo_epi16(a00a10, negX);
197 
198         // (0, 0, a01, a10)
199         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
200 
201         // Expand to 16 bits per component.
202         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
203 
204         // (a01 * (16-y)), (a11 * y)
205         a01a11 = _mm_mullo_epi16(a01a11, allY);
206 
207         // (a01 * (16-y) * x), (a11 * y * x)
208         a01a11 = _mm_mullo_epi16(a01a11, allX);
209 
210         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
211         __m128i sum = _mm_add_epi16(a00a10, a01a11);
212 
213         // (DC, a00*w00 + a01*w01)
214         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
215 
216         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
217         sum = _mm_add_epi16(sum, shifted);
218 
219         // Divide each 16 bit component by 256.
220         sum = _mm_srli_epi16(sum, 8);
221 
222         // Multiply by alpha.
223         sum = _mm_mullo_epi16(sum, alpha);
224 
225         // Divide each 16 bit component by 256.
226         sum = _mm_srli_epi16(sum, 8);
227 
228         // Pack lower 4 16 bit values of sum into lower 4 bytes.
229         sum = _mm_packus_epi16(sum, zero);
230 
231         // Extract low int and store.
232         *colors++ = _mm_cvtsi128_si32(sum);
233     } while (--count > 0);
234 }
235 
ClampX_ClampY_pack_filter(SkFixed f,unsigned max,SkFixed one)236 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
237                                                  SkFixed one) {
238     unsigned i = SkClampMax(f >> 16, max);
239     i = (i << 4) | ((f >> 12) & 0xF);
240     return (i << 14) | SkClampMax((f + one) >> 16, max);
241 }
242 
243 /*  SSE version of ClampX_ClampY_filter_scale()
244  *  portable version is in core/SkBitmapProcState_matrix.h
245  */
ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)246 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
247                                      int count, int x, int y) {
248     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
249                              SkMatrix::kScale_Mask)) == 0);
250     SkASSERT(s.fInvKy == 0);
251 
252     const unsigned maxX = s.fBitmap->width() - 1;
253     const SkFixed one = s.fFilterOneX;
254     const SkFixed dx = s.fInvSx;
255     SkFixed fx;
256 
257     SkPoint pt;
258     s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
259                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
260     const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
261     const unsigned maxY = s.fBitmap->height() - 1;
262     // compute our two Y values up front
263     *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
264     // now initialize fx
265     fx = SkScalarToFixed(pt.fX) - (one >> 1);
266 
267     // test if we don't need to apply the tile proc
268     if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
269         (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
270         if (count >= 4) {
271             // SSE version of decal_filter_scale
272             while ((size_t(xy) & 0x0F) != 0) {
273                 SkASSERT((fx >> (16 + 14)) == 0);
274                 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
275                 fx += dx;
276                 count--;
277             }
278 
279             __m128i wide_1    = _mm_set1_epi32(1);
280             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
281             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
282                                               fx + dx, fx);
283 
284             while (count >= 4) {
285                 __m128i wide_out;
286 
287                 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
288                 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
289                                         _mm_srai_epi32(wide_fx, 16), wide_1));
290 
291                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
292 
293                 xy += 4;
294                 fx += dx * 4;
295                 wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
296                 count -= 4;
297             } // while count >= 4
298         } // if count >= 4
299 
300         while (count-- > 0) {
301             SkASSERT((fx >> (16 + 14)) == 0);
302             *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
303             fx += dx;
304         }
305     } else {
306         // SSE2 only support 16bit interger max & min, so only process the case
307         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
308         // height, there should be rare bitmap whose height will be greater
309         // than max 16bit interger in the real world.
310         if ((count >= 4) && (maxX <= 0xFFFF)) {
311             while (((size_t)xy & 0x0F) != 0) {
312                 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
313                 fx += dx;
314                 count--;
315             }
316 
317             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
318                                               fx + dx, fx);
319             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
320             __m128i wide_one  = _mm_set1_epi32(one);
321             __m128i wide_maxX = _mm_set1_epi32(maxX);
322             __m128i wide_mask = _mm_set1_epi32(0xF);
323 
324              while (count >= 4) {
325                 __m128i wide_i;
326                 __m128i wide_lo;
327                 __m128i wide_fx1;
328 
329                 // i = SkClampMax(f>>16,maxX)
330                 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
331                                        _mm_setzero_si128());
332                 wide_i = _mm_min_epi16(wide_i, wide_maxX);
333 
334                 // i<<4 | TILEX_LOW_BITS(fx)
335                 wide_lo = _mm_srli_epi32(wide_fx, 12);
336                 wide_lo = _mm_and_si128(wide_lo, wide_mask);
337                 wide_i  = _mm_slli_epi32(wide_i, 4);
338                 wide_i  = _mm_or_si128(wide_i, wide_lo);
339 
340                 // i<<14
341                 wide_i = _mm_slli_epi32(wide_i, 14);
342 
343                 // SkClampMax(((f+one))>>16,max)
344                 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
345                 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
346                                                         _mm_setzero_si128());
347                 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
348 
349                 // final combination
350                 wide_i = _mm_or_si128(wide_i, wide_fx1);
351                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
352 
353                 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
354                 fx += dx * 4;
355                 xy += 4;
356                 count -= 4;
357             } // while count >= 4
358         } // if count >= 4
359 
360         while (count-- > 0) {
361             *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
362             fx += dx;
363         }
364     }
365 }
366 
367 /*  SSE version of ClampX_ClampY_nofilter_scale()
368  *  portable version is in core/SkBitmapProcState_matrix.h
369  */
ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)370 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
371                                     uint32_t xy[], int count, int x, int y) {
372     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
373                              SkMatrix::kScale_Mask)) == 0);
374 
375     // we store y, x, x, x, x, x
376     const unsigned maxX = s.fBitmap->width() - 1;
377     SkFixed fx;
378     SkPoint pt;
379     s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
380                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
381     fx = SkScalarToFixed(pt.fY);
382     const unsigned maxY = s.fBitmap->height() - 1;
383     *xy++ = SkClampMax(fx >> 16, maxY);
384     fx = SkScalarToFixed(pt.fX);
385 
386     if (0 == maxX) {
387         // all of the following X values must be 0
388         memset(xy, 0, count * sizeof(uint16_t));
389         return;
390     }
391 
392     const SkFixed dx = s.fInvSx;
393 
394     // test if we don't need to apply the tile proc
395     if ((unsigned)(fx >> 16) <= maxX &&
396         (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
397         // SSE version of decal_nofilter_scale
398         if (count >= 8) {
399             while (((size_t)xy & 0x0F) != 0) {
400                 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
401                 fx += 2 * dx;
402                 count -= 2;
403             }
404 
405             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
406             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
407 
408             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
409                                              fx + dx, fx);
410             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
411 
412             while (count >= 8) {
413                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
414                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
415 
416                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
417                                                       wide_out_high);
418                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
419 
420                 wide_low = _mm_add_epi32(wide_low, wide_dx8);
421                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
422 
423                 xy += 4;
424                 fx += dx * 8;
425                 count -= 8;
426             }
427         } // if count >= 8
428 
429         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
430         while (count-- > 0) {
431             *xx++ = SkToU16(fx >> 16);
432             fx += dx;
433         }
434     } else {
435         // SSE2 only support 16bit interger max & min, so only process the case
436         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
437         // height, there should be rare bitmap whose height will be greater
438         // than max 16bit interger in the real world.
439         if ((count >= 8) && (maxX <= 0xFFFF)) {
440             while (((size_t)xy & 0x0F) != 0) {
441                 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
442                                         SkClampMax(fx >> 16, maxX));
443                 fx += 2 * dx;
444                 count -= 2;
445             }
446 
447             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
448             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
449 
450             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
451                                              fx + dx, fx);
452             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
453             __m128i wide_maxX = _mm_set1_epi32(maxX);
454 
455             while (count >= 8) {
456                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
457                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
458 
459                 wide_out_low  = _mm_max_epi16(wide_out_low,
460                                               _mm_setzero_si128());
461                 wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
462                 wide_out_high = _mm_max_epi16(wide_out_high,
463                                               _mm_setzero_si128());
464                 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
465 
466                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
467                                                       wide_out_high);
468                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
469 
470                 wide_low  = _mm_add_epi32(wide_low, wide_dx8);
471                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
472 
473                 xy += 4;
474                 fx += dx * 8;
475                 count -= 8;
476             }
477         } // if count >= 8
478 
479         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
480         while (count-- > 0) {
481             *xx++ = SkClampMax(fx >> 16, maxX);
482             fx += dx;
483         }
484     }
485 }
486 
487 /*  SSE version of ClampX_ClampY_filter_affine()
488  *  portable version is in core/SkBitmapProcState_matrix.h
489  */
ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)490 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
491                                       uint32_t xy[], int count, int x, int y) {
492     SkPoint srcPt;
493     s.fInvProc(*s.fInvMatrix,
494                SkIntToScalar(x) + SK_ScalarHalf,
495                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
496 
497     SkFixed oneX = s.fFilterOneX;
498     SkFixed oneY = s.fFilterOneY;
499     SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
500     SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
501     SkFixed dx = s.fInvSx;
502     SkFixed dy = s.fInvKy;
503     unsigned maxX = s.fBitmap->width() - 1;
504     unsigned maxY = s.fBitmap->height() - 1;
505 
506     if (count >= 2 && (maxX <= 0xFFFF)) {
507         SkFixed dx2 = dx + dx;
508         SkFixed dy2 = dy + dy;
509 
510         __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
511         __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
512         __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
513         __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
514         __m128i wide_mask = _mm_set1_epi32(0xF);
515 
516         while (count >= 2) {
517             // i = SkClampMax(f>>16,maxX)
518             __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
519                                            _mm_setzero_si128());
520             wide_i = _mm_min_epi16(wide_i, wide_max);
521 
522             // i<<4 | TILEX_LOW_BITS(f)
523             __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
524             wide_lo = _mm_and_si128(wide_lo, wide_mask);
525             wide_i  = _mm_slli_epi32(wide_i, 4);
526             wide_i  = _mm_or_si128(wide_i, wide_lo);
527 
528             // i<<14
529             wide_i = _mm_slli_epi32(wide_i, 14);
530 
531             // SkClampMax(((f+one))>>16,max)
532             __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
533             wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
534                                                    _mm_setzero_si128());
535             wide_f1 = _mm_min_epi16(wide_f1, wide_max);
536 
537             // final combination
538             wide_i = _mm_or_si128(wide_i, wide_f1);
539             _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
540 
541             wide_f = _mm_add_epi32(wide_f, wide_d2);
542 
543             fx += dx2;
544             fy += dy2;
545             xy += 4;
546             count -= 2;
547         } // while count >= 2
548     } // if count >= 2
549 
550     while (count-- > 0) {
551         *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
552         fy += dy;
553         *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
554         fx += dx;
555     }
556 }
557 
558 /*  SSE version of ClampX_ClampY_nofilter_affine()
559  *  portable version is in core/SkBitmapProcState_matrix.h
560  */
ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)561 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
562                                       uint32_t xy[], int count, int x, int y) {
563     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
564     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
565                              SkMatrix::kScale_Mask |
566                              SkMatrix::kAffine_Mask)) == 0);
567 
568     SkPoint srcPt;
569     s.fInvProc(*s.fInvMatrix,
570                SkIntToScalar(x) + SK_ScalarHalf,
571                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
572 
573     SkFixed fx = SkScalarToFixed(srcPt.fX);
574     SkFixed fy = SkScalarToFixed(srcPt.fY);
575     SkFixed dx = s.fInvSx;
576     SkFixed dy = s.fInvKy;
577     int maxX = s.fBitmap->width() - 1;
578     int maxY = s.fBitmap->height() - 1;
579 
580     if (count >= 4 && (maxX <= 0xFFFF)) {
581         while (((size_t)xy & 0x0F) != 0) {
582             *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
583                                   SkClampMax(fx >> 16, maxX);
584             fx += dx;
585             fy += dy;
586             count--;
587         }
588 
589         SkFixed dx4 = dx * 4;
590         SkFixed dy4 = dy * 4;
591 
592         __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
593                                           fx + dx, fx);
594         __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
595                                           fy + dy, fy);
596         __m128i wide_dx4  = _mm_set1_epi32(dx4);
597         __m128i wide_dy4  = _mm_set1_epi32(dy4);
598 
599         __m128i wide_maxX = _mm_set1_epi32(maxX);
600         __m128i wide_maxY = _mm_set1_epi32(maxY);
601 
602         while (count >= 4) {
603             // SkClampMax(fx>>16,maxX)
604             __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
605                                             _mm_setzero_si128());
606             wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
607 
608             // SkClampMax(fy>>16,maxY)
609             __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
610                                             _mm_setzero_si128());
611             wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
612 
613             // final combination
614             __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
615                                           wide_lo);
616             _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
617 
618             wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
619             wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
620 
621             fx += dx4;
622             fy += dy4;
623             xy += 4;
624             count -= 4;
625         } // while count >= 4
626     } // if count >= 4
627 
628     while (count-- > 0) {
629         *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
630                               SkClampMax(fx >> 16, maxX);
631         fx += dx;
632         fy += dy;
633     }
634 }
635 
636 /*  SSE version of S32_D16_filter_DX_SSE2
637  *  Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
638  *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
639  */
S32_D16_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint16_t * colors)640 void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
641                                    const uint32_t* xy,
642                                    int count, uint16_t* colors) {
643     SkASSERT(count > 0 && colors != NULL);
644     SkASSERT(s.fDoFilter);
645     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
646     SkASSERT(s.fBitmap->isOpaque());
647 
648     SkPMColor dstColor;
649     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
650     unsigned rb = s.fBitmap->rowBytes();
651     uint32_t XY = *xy++;
652     unsigned y0 = XY >> 14;
653     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
654     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
655     unsigned subY = y0 & 0xF;
656 
657     // ( 0,  0,  0,  0,  0,  0,  0, 16)
658     __m128i sixteen = _mm_cvtsi32_si128(16);
659 
660     // ( 0,  0,  0,  0, 16, 16, 16, 16)
661     sixteen = _mm_shufflelo_epi16(sixteen, 0);
662 
663     // ( 0,  0,  0,  0,  0,  0,  0,  y)
664     __m128i allY = _mm_cvtsi32_si128(subY);
665 
666     // ( 0,  0,  0,  0,  y,  y,  y,  y)
667     allY = _mm_shufflelo_epi16(allY, 0);
668 
669     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
670     __m128i negY = _mm_sub_epi16(sixteen, allY);
671 
672     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
673     allY = _mm_unpacklo_epi64(allY, negY);
674 
675     // (16, 16, 16, 16, 16, 16, 16, 16 )
676     sixteen = _mm_shuffle_epi32(sixteen, 0);
677 
678     // ( 0,  0,  0,  0,  0,  0,  0,  0)
679     __m128i zero = _mm_setzero_si128();
680 
681     do {
682         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
683         unsigned x0 = XX >> 18;
684         unsigned x1 = XX & 0x3FFF;
685 
686         // (0, 0, 0, 0, 0, 0, 0, x)
687         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
688 
689         // (0, 0, 0, 0, x, x, x, x)
690         allX = _mm_shufflelo_epi16(allX, 0);
691 
692         // (x, x, x, x, x, x, x, x)
693         allX = _mm_shuffle_epi32(allX, 0);
694 
695         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
696         __m128i negX = _mm_sub_epi16(sixteen, allX);
697 
698         // Load 4 samples (pixels).
699         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
700         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
701         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
702         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
703 
704         // (0, 0, a00, a10)
705         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
706 
707         // Expand to 16 bits per component.
708         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
709 
710         // ((a00 * (16-y)), (a10 * y)).
711         a00a10 = _mm_mullo_epi16(a00a10, allY);
712 
713         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
714         a00a10 = _mm_mullo_epi16(a00a10, negX);
715 
716         // (0, 0, a01, a10)
717         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
718 
719         // Expand to 16 bits per component.
720         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
721 
722         // (a01 * (16-y)), (a11 * y)
723         a01a11 = _mm_mullo_epi16(a01a11, allY);
724 
725         // (a01 * (16-y) * x), (a11 * y * x)
726         a01a11 = _mm_mullo_epi16(a01a11, allX);
727 
728         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
729         __m128i sum = _mm_add_epi16(a00a10, a01a11);
730 
731         // (DC, a00*w00 + a01*w01)
732         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
733 
734         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
735         sum = _mm_add_epi16(sum, shifted);
736 
737         // Divide each 16 bit component by 256.
738         sum = _mm_srli_epi16(sum, 8);
739 
740         // Pack lower 4 16 bit values of sum into lower 4 bytes.
741         sum = _mm_packus_epi16(sum, zero);
742 
743         // Extract low int and store.
744         dstColor = _mm_cvtsi128_si32(sum);
745 
746         //*colors++ = SkPixel32ToPixel16(dstColor);
747         // below is much faster than the above. It's tested for Android benchmark--Softweg
748         __m128i _m_temp1 = _mm_set1_epi32(dstColor);
749         __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
750 
751         unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
752         unsigned r = (r32 & ((1<<5) -1)) << 11;
753 
754         _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
755         unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
756         unsigned g = (g32 & ((1<<6) -1)) << 5;
757 
758         _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
759         unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
760         unsigned b = (b32 & ((1<<5) -1));
761 
762         *colors++ = r | g | b;
763 
764     } while (--count > 0);
765 }
766