• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBitmapProcState_utils.h"
11 #include "SkColorPriv.h"
12 #include "SkPaint.h"
13 #include "SkUtils.h"
14 
S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)15 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
16                                    const uint32_t* xy,
17                                    int count, uint32_t* colors) {
18     SkASSERT(count > 0 && colors != nullptr);
19     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
20     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
21     SkASSERT(s.fAlphaScale == 256);
22 
23     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
24     size_t rb = s.fPixmap.rowBytes();
25     uint32_t XY = *xy++;
26     unsigned y0 = XY >> 14;
27     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
28     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
29     unsigned subY = y0 & 0xF;
30 
31     // ( 0,  0,  0,  0,  0,  0,  0, 16)
32     __m128i sixteen = _mm_cvtsi32_si128(16);
33 
34     // ( 0,  0,  0,  0, 16, 16, 16, 16)
35     sixteen = _mm_shufflelo_epi16(sixteen, 0);
36 
37     // ( 0,  0,  0,  0,  0,  0,  0,  y)
38     __m128i allY = _mm_cvtsi32_si128(subY);
39 
40     // ( 0,  0,  0,  0,  y,  y,  y,  y)
41     allY = _mm_shufflelo_epi16(allY, 0);
42 
43     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
44     __m128i negY = _mm_sub_epi16(sixteen, allY);
45 
46     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
47     allY = _mm_unpacklo_epi64(allY, negY);
48 
49     // (16, 16, 16, 16, 16, 16, 16, 16 )
50     sixteen = _mm_shuffle_epi32(sixteen, 0);
51 
52     // ( 0,  0,  0,  0,  0,  0,  0,  0)
53     __m128i zero = _mm_setzero_si128();
54     do {
55         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
56         unsigned x0 = XX >> 18;
57         unsigned x1 = XX & 0x3FFF;
58 
59         // (0, 0, 0, 0, 0, 0, 0, x)
60         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
61 
62         // (0, 0, 0, 0, x, x, x, x)
63         allX = _mm_shufflelo_epi16(allX, 0);
64 
65         // (x, x, x, x, x, x, x, x)
66         allX = _mm_shuffle_epi32(allX, 0);
67 
68         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
69         __m128i negX = _mm_sub_epi16(sixteen, allX);
70 
71         // Load 4 samples (pixels).
72         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
73         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
74         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
75         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
76 
77         // (0, 0, a00, a10)
78         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
79 
80         // Expand to 16 bits per component.
81         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
82 
83         // ((a00 * (16-y)), (a10 * y)).
84         a00a10 = _mm_mullo_epi16(a00a10, allY);
85 
86         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
87         a00a10 = _mm_mullo_epi16(a00a10, negX);
88 
89         // (0, 0, a01, a10)
90         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
91 
92         // Expand to 16 bits per component.
93         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
94 
95         // (a01 * (16-y)), (a11 * y)
96         a01a11 = _mm_mullo_epi16(a01a11, allY);
97 
98         // (a01 * (16-y) * x), (a11 * y * x)
99         a01a11 = _mm_mullo_epi16(a01a11, allX);
100 
101         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
102         __m128i sum = _mm_add_epi16(a00a10, a01a11);
103 
104         // (DC, a00*w00 + a01*w01)
105         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
106 
107         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
108         sum = _mm_add_epi16(sum, shifted);
109 
110         // Divide each 16 bit component by 256.
111         sum = _mm_srli_epi16(sum, 8);
112 
113         // Pack lower 4 16 bit values of sum into lower 4 bytes.
114         sum = _mm_packus_epi16(sum, zero);
115 
116         // Extract low int and store.
117         *colors++ = _mm_cvtsi128_si32(sum);
118     } while (--count > 0);
119 }
120 
S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)121 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
122                                   const uint32_t* xy,
123                                   int count, uint32_t* colors) {
124     SkASSERT(count > 0 && colors != nullptr);
125     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
126     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
127     SkASSERT(s.fAlphaScale < 256);
128 
129     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
130     size_t rb = s.fPixmap.rowBytes();
131     uint32_t XY = *xy++;
132     unsigned y0 = XY >> 14;
133     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
134     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
135     unsigned subY = y0 & 0xF;
136 
137     // ( 0,  0,  0,  0,  0,  0,  0, 16)
138     __m128i sixteen = _mm_cvtsi32_si128(16);
139 
140     // ( 0,  0,  0,  0, 16, 16, 16, 16)
141     sixteen = _mm_shufflelo_epi16(sixteen, 0);
142 
143     // ( 0,  0,  0,  0,  0,  0,  0,  y)
144     __m128i allY = _mm_cvtsi32_si128(subY);
145 
146     // ( 0,  0,  0,  0,  y,  y,  y,  y)
147     allY = _mm_shufflelo_epi16(allY, 0);
148 
149     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
150     __m128i negY = _mm_sub_epi16(sixteen, allY);
151 
152     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
153     allY = _mm_unpacklo_epi64(allY, negY);
154 
155     // (16, 16, 16, 16, 16, 16, 16, 16 )
156     sixteen = _mm_shuffle_epi32(sixteen, 0);
157 
158     // ( 0,  0,  0,  0,  0,  0,  0,  0)
159     __m128i zero = _mm_setzero_si128();
160 
161     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
162     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
163 
164     do {
165         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
166         unsigned x0 = XX >> 18;
167         unsigned x1 = XX & 0x3FFF;
168 
169         // (0, 0, 0, 0, 0, 0, 0, x)
170         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
171 
172         // (0, 0, 0, 0, x, x, x, x)
173         allX = _mm_shufflelo_epi16(allX, 0);
174 
175         // (x, x, x, x, x, x, x, x)
176         allX = _mm_shuffle_epi32(allX, 0);
177 
178         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
179         __m128i negX = _mm_sub_epi16(sixteen, allX);
180 
181         // Load 4 samples (pixels).
182         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
183         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
184         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
185         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
186 
187         // (0, 0, a00, a10)
188         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
189 
190         // Expand to 16 bits per component.
191         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
192 
193         // ((a00 * (16-y)), (a10 * y)).
194         a00a10 = _mm_mullo_epi16(a00a10, allY);
195 
196         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
197         a00a10 = _mm_mullo_epi16(a00a10, negX);
198 
199         // (0, 0, a01, a10)
200         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
201 
202         // Expand to 16 bits per component.
203         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
204 
205         // (a01 * (16-y)), (a11 * y)
206         a01a11 = _mm_mullo_epi16(a01a11, allY);
207 
208         // (a01 * (16-y) * x), (a11 * y * x)
209         a01a11 = _mm_mullo_epi16(a01a11, allX);
210 
211         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
212         __m128i sum = _mm_add_epi16(a00a10, a01a11);
213 
214         // (DC, a00*w00 + a01*w01)
215         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
216 
217         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
218         sum = _mm_add_epi16(sum, shifted);
219 
220         // Divide each 16 bit component by 256.
221         sum = _mm_srli_epi16(sum, 8);
222 
223         // Multiply by alpha.
224         sum = _mm_mullo_epi16(sum, alpha);
225 
226         // Divide each 16 bit component by 256.
227         sum = _mm_srli_epi16(sum, 8);
228 
229         // Pack lower 4 16 bit values of sum into lower 4 bytes.
230         sum = _mm_packus_epi16(sum, zero);
231 
232         // Extract low int and store.
233         *colors++ = _mm_cvtsi128_si32(sum);
234     } while (--count > 0);
235 }
236 
ClampX_ClampY_pack_filter(SkFixed f,unsigned max,SkFixed one)237 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
238                                                  SkFixed one) {
239     unsigned i = SkClampMax(f >> 16, max);
240     i = (i << 4) | ((f >> 12) & 0xF);
241     return (i << 14) | SkClampMax((f + one) >> 16, max);
242 }
243 
244 /*  SSE version of ClampX_ClampY_filter_scale()
245  *  portable version is in core/SkBitmapProcState_matrix.h
246  */
ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)247 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
248                                      int count, int x, int y) {
249     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
250                              SkMatrix::kScale_Mask)) == 0);
251     SkASSERT(s.fInvKy == 0);
252 
253     const unsigned maxX = s.fPixmap.width() - 1;
254     const SkFixed one = s.fFilterOneX;
255     const SkFixed dx = s.fInvSx;
256 
257     const SkBitmapProcStateAutoMapper mapper(s, x, y);
258     const SkFixed fy = mapper.fixedY();
259     const unsigned maxY = s.fPixmap.height() - 1;
260     // compute our two Y values up front
261     *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
262     // now initialize fx
263     SkFixed fx = mapper.fixedX();
264 
265     // test if we don't need to apply the tile proc
266     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
267         if (count >= 4) {
268             // SSE version of decal_filter_scale
269             while ((size_t(xy) & 0x0F) != 0) {
270                 SkASSERT((fx >> (16 + 14)) == 0);
271                 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
272                 fx += dx;
273                 count--;
274             }
275 
276             __m128i wide_1    = _mm_set1_epi32(1);
277             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
278             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
279                                               fx + dx, fx);
280 
281             while (count >= 4) {
282                 __m128i wide_out;
283 
284                 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
285                 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
286                                         _mm_srai_epi32(wide_fx, 16), wide_1));
287 
288                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
289 
290                 xy += 4;
291                 fx += dx * 4;
292                 wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
293                 count -= 4;
294             } // while count >= 4
295         } // if count >= 4
296 
297         while (count-- > 0) {
298             SkASSERT((fx >> (16 + 14)) == 0);
299             *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
300             fx += dx;
301         }
302     } else {
303         // SSE2 only support 16bit interger max & min, so only process the case
304         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
305         // height, there should be rare bitmap whose height will be greater
306         // than max 16bit interger in the real world.
307         if ((count >= 4) && (maxX <= 0xFFFF)) {
308             while (((size_t)xy & 0x0F) != 0) {
309                 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
310                 fx += dx;
311                 count--;
312             }
313 
314             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
315                                               fx + dx, fx);
316             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
317             __m128i wide_one  = _mm_set1_epi32(one);
318             __m128i wide_maxX = _mm_set1_epi32(maxX);
319             __m128i wide_mask = _mm_set1_epi32(0xF);
320 
321              while (count >= 4) {
322                 __m128i wide_i;
323                 __m128i wide_lo;
324                 __m128i wide_fx1;
325 
326                 // i = SkClampMax(f>>16,maxX)
327                 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
328                                        _mm_setzero_si128());
329                 wide_i = _mm_min_epi16(wide_i, wide_maxX);
330 
331                 // i<<4 | EXTRACT_LOW_BITS(fx)
332                 wide_lo = _mm_srli_epi32(wide_fx, 12);
333                 wide_lo = _mm_and_si128(wide_lo, wide_mask);
334                 wide_i  = _mm_slli_epi32(wide_i, 4);
335                 wide_i  = _mm_or_si128(wide_i, wide_lo);
336 
337                 // i<<14
338                 wide_i = _mm_slli_epi32(wide_i, 14);
339 
340                 // SkClampMax(((f+one))>>16,max)
341                 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
342                 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
343                                                         _mm_setzero_si128());
344                 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
345 
346                 // final combination
347                 wide_i = _mm_or_si128(wide_i, wide_fx1);
348                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
349 
350                 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
351                 fx += dx * 4;
352                 xy += 4;
353                 count -= 4;
354             } // while count >= 4
355         } // if count >= 4
356 
357         while (count-- > 0) {
358             *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
359             fx += dx;
360         }
361     }
362 }
363 
364 /*  SSE version of ClampX_ClampY_nofilter_scale()
365  *  portable version is in core/SkBitmapProcState_matrix.h
366  */
ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)367 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
368                                     uint32_t xy[], int count, int x, int y) {
369     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
370                              SkMatrix::kScale_Mask)) == 0);
371 
372     // we store y, x, x, x, x, x
373     const unsigned maxX = s.fPixmap.width() - 1;
374     const SkBitmapProcStateAutoMapper mapper(s, x, y);
375     const unsigned maxY = s.fPixmap.height() - 1;
376     *xy++ = SkClampMax(mapper.intY(), maxY);
377     SkFixed fx = mapper.fixedX();
378 
379     if (0 == maxX) {
380         // all of the following X values must be 0
381         memset(xy, 0, count * sizeof(uint16_t));
382         return;
383     }
384 
385     const SkFixed dx = s.fInvSx;
386 
387     // test if we don't need to apply the tile proc
388     if ((unsigned)(fx >> 16) <= maxX &&
389         (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
390         // SSE version of decal_nofilter_scale
391         if (count >= 8) {
392             while (((size_t)xy & 0x0F) != 0) {
393                 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
394                 fx += 2 * dx;
395                 count -= 2;
396             }
397 
398             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
399             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
400 
401             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
402                                              fx + dx, fx);
403             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
404 
405             while (count >= 8) {
406                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
407                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
408 
409                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
410                                                       wide_out_high);
411                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
412 
413                 wide_low = _mm_add_epi32(wide_low, wide_dx8);
414                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
415 
416                 xy += 4;
417                 fx += dx * 8;
418                 count -= 8;
419             }
420         } // if count >= 8
421 
422         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
423         while (count-- > 0) {
424             *xx++ = SkToU16(fx >> 16);
425             fx += dx;
426         }
427     } else {
428         // SSE2 only support 16bit interger max & min, so only process the case
429         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
430         // height, there should be rare bitmap whose height will be greater
431         // than max 16bit interger in the real world.
432         if ((count >= 8) && (maxX <= 0xFFFF)) {
433             while (((size_t)xy & 0x0F) != 0) {
434                 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
435                                         SkClampMax(fx >> 16, maxX));
436                 fx += 2 * dx;
437                 count -= 2;
438             }
439 
440             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
441             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
442 
443             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
444                                              fx + dx, fx);
445             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
446             __m128i wide_maxX = _mm_set1_epi32(maxX);
447 
448             while (count >= 8) {
449                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
450                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
451 
452                 wide_out_low  = _mm_max_epi16(wide_out_low,
453                                               _mm_setzero_si128());
454                 wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
455                 wide_out_high = _mm_max_epi16(wide_out_high,
456                                               _mm_setzero_si128());
457                 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
458 
459                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
460                                                       wide_out_high);
461                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
462 
463                 wide_low  = _mm_add_epi32(wide_low, wide_dx8);
464                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
465 
466                 xy += 4;
467                 fx += dx * 8;
468                 count -= 8;
469             }
470         } // if count >= 8
471 
472         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
473         while (count-- > 0) {
474             *xx++ = SkClampMax(fx >> 16, maxX);
475             fx += dx;
476         }
477     }
478 }
479 
480 /*  SSE version of ClampX_ClampY_filter_affine()
481  *  portable version is in core/SkBitmapProcState_matrix.h
482  */
ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)483 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
484                                       uint32_t xy[], int count, int x, int y) {
485     const SkBitmapProcStateAutoMapper mapper(s, x, y);
486 
487     SkFixed oneX = s.fFilterOneX;
488     SkFixed oneY = s.fFilterOneY;
489     SkFixed fx = mapper.fixedX();
490     SkFixed fy = mapper.fixedY();
491     SkFixed dx = s.fInvSx;
492     SkFixed dy = s.fInvKy;
493     unsigned maxX = s.fPixmap.width() - 1;
494     unsigned maxY = s.fPixmap.height() - 1;
495 
496     if (count >= 2 && (maxX <= 0xFFFF)) {
497         SkFixed dx2 = dx + dx;
498         SkFixed dy2 = dy + dy;
499 
500         __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
501         __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
502         __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
503         __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
504         __m128i wide_mask = _mm_set1_epi32(0xF);
505 
506         while (count >= 2) {
507             // i = SkClampMax(f>>16,maxX)
508             __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
509                                            _mm_setzero_si128());
510             wide_i = _mm_min_epi16(wide_i, wide_max);
511 
512             // i<<4 | EXTRACT_LOW_BITS(f)
513             __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
514             wide_lo = _mm_and_si128(wide_lo, wide_mask);
515             wide_i  = _mm_slli_epi32(wide_i, 4);
516             wide_i  = _mm_or_si128(wide_i, wide_lo);
517 
518             // i<<14
519             wide_i = _mm_slli_epi32(wide_i, 14);
520 
521             // SkClampMax(((f+one))>>16,max)
522             __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
523             wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
524                                                    _mm_setzero_si128());
525             wide_f1 = _mm_min_epi16(wide_f1, wide_max);
526 
527             // final combination
528             wide_i = _mm_or_si128(wide_i, wide_f1);
529             _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
530 
531             wide_f = _mm_add_epi32(wide_f, wide_d2);
532 
533             fx += dx2;
534             fy += dy2;
535             xy += 4;
536             count -= 2;
537         } // while count >= 2
538     } // if count >= 2
539 
540     while (count-- > 0) {
541         *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
542         fy += dy;
543         *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
544         fx += dx;
545     }
546 }
547 
548 /*  SSE version of ClampX_ClampY_nofilter_affine()
549  *  portable version is in core/SkBitmapProcState_matrix.h
550  */
ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState & s,uint32_t xy[],int count,int x,int y)551 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
552                                       uint32_t xy[], int count, int x, int y) {
553     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
554     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
555                              SkMatrix::kScale_Mask |
556                              SkMatrix::kAffine_Mask)) == 0);
557 
558     const SkBitmapProcStateAutoMapper mapper(s, x, y);
559 
560     SkFixed fx = mapper.fixedX();
561     SkFixed fy = mapper.fixedY();
562     SkFixed dx = s.fInvSx;
563     SkFixed dy = s.fInvKy;
564     int maxX = s.fPixmap.width() - 1;
565     int maxY = s.fPixmap.height() - 1;
566 
567     if (count >= 4 && (maxX <= 0xFFFF)) {
568         while (((size_t)xy & 0x0F) != 0) {
569             *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
570                                   SkClampMax(fx >> 16, maxX);
571             fx += dx;
572             fy += dy;
573             count--;
574         }
575 
576         SkFixed dx4 = dx * 4;
577         SkFixed dy4 = dy * 4;
578 
579         __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
580                                           fx + dx, fx);
581         __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
582                                           fy + dy, fy);
583         __m128i wide_dx4  = _mm_set1_epi32(dx4);
584         __m128i wide_dy4  = _mm_set1_epi32(dy4);
585 
586         __m128i wide_maxX = _mm_set1_epi32(maxX);
587         __m128i wide_maxY = _mm_set1_epi32(maxY);
588 
589         while (count >= 4) {
590             // SkClampMax(fx>>16,maxX)
591             __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
592                                             _mm_setzero_si128());
593             wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
594 
595             // SkClampMax(fy>>16,maxY)
596             __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
597                                             _mm_setzero_si128());
598             wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
599 
600             // final combination
601             __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
602                                           wide_lo);
603             _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
604 
605             wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
606             wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
607 
608             fx += dx4;
609             fy += dy4;
610             xy += 4;
611             count -= 4;
612         } // while count >= 4
613     } // if count >= 4
614 
615     while (count-- > 0) {
616         *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
617                               SkClampMax(fx >> 16, maxX);
618         fx += dx;
619         fy += dy;
620     }
621 }
622