• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2011 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include <emmintrin.h>
9 #include "SkBlitRect_opts_SSE2.h"
10 #include "SkBlitRow.h"
11 #include "SkColorPriv.h"
12 
13 /* Simple blitting of opaque rectangles less than 31 pixels wide:
14  * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
15  */
BlitRect32_OpaqueNarrow_SSE2(SkPMColor * SK_RESTRICT destination,int width,int height,size_t rowBytes,uint32_t color)16 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
17                                   int width, int height,
18                                   size_t rowBytes, uint32_t color) {
19     SkASSERT(255 == SkGetPackedA32(color));
20     SkASSERT(width > 0);
21     SkASSERT(width < 31);
22 
23     while (--height >= 0) {
24         SkPMColor* dst = destination;
25         int count = width;
26 
27         while (count > 4) {
28             *dst++ = color;
29             *dst++ = color;
30             *dst++ = color;
31             *dst++ = color;
32             count -= 4;
33         }
34 
35         while (count > 0) {
36             *dst++ = color;
37             --count;
38         }
39 
40         destination = (uint32_t*)((char*)destination + rowBytes);
41     }
42 }
43 
44 /*
45  * Fast blitting of opaque rectangles at least 31 pixels wide:
46  * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
47  * A 31 pixel rectangle is guaranteed to have at least one
48  * 16-pixel aligned span that can take advantage of mm_store.
49  */
BlitRect32_OpaqueWide_SSE2(SkPMColor * SK_RESTRICT destination,int width,int height,size_t rowBytes,uint32_t color)50 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
51                                 int width, int height,
52                                 size_t rowBytes, uint32_t color) {
53     SkASSERT(255 == SkGetPackedA32(color));
54     SkASSERT(width >= 31);
55 
56     __m128i color_wide = _mm_set1_epi32(color);
57     while (--height >= 0) {
58         // Prefetching one row ahead to L1 cache can equal hardware
59         // performance for large/tall rects, but never *beats*
60         // hardware performance.
61         SkPMColor* dst = destination;
62         int count = width;
63 
64         while (((size_t)dst) & 0x0F) {
65             *dst++ = color;
66             --count;
67         }
68         __m128i *d = reinterpret_cast<__m128i*>(dst);
69 
70         // Googling suggests _mm_stream is only going to beat _mm_store
71         // for things that wouldn't fit in L2 cache anyway, typically
72         // >500kB, and precisely fill cache lines.  For us, with
73         // arrays > 100k elements _mm_stream is still 100%+ slower than
74         // mm_store.
75 
76         // Unrolling to count >= 64 is a break-even for most
77         // input patterns; we seem to be saturating the bus and having
78         // low enough overhead at 32.
79 
80         while (count >= 32) {
81             _mm_store_si128(d++, color_wide);
82             _mm_store_si128(d++, color_wide);
83             _mm_store_si128(d++, color_wide);
84             _mm_store_si128(d++, color_wide);
85             _mm_store_si128(d++, color_wide);
86             _mm_store_si128(d++, color_wide);
87             _mm_store_si128(d++, color_wide);
88             _mm_store_si128(d++, color_wide);
89             count -= 32;
90         }
91         if (count >= 16) {
92             _mm_store_si128(d++, color_wide);
93             _mm_store_si128(d++, color_wide);
94             _mm_store_si128(d++, color_wide);
95             _mm_store_si128(d++, color_wide);
96             count -= 16;
97         }
98         dst = reinterpret_cast<uint32_t*>(d);
99 
100         // Unrolling the loop in the Narrow code is a significant performance
101         // gain, but unrolling this loop appears to make no difference in
102         // benchmarks with either mm_store_si128 or individual sets.
103 
104         while (count > 0) {
105             *dst++ = color;
106             --count;
107         }
108 
109         destination = (uint32_t*)((char*)destination + rowBytes);
110     }
111 }
112 
ColorRect32_SSE2(SkPMColor * destination,int width,int height,size_t rowBytes,uint32_t color)113 void ColorRect32_SSE2(SkPMColor* destination,
114                       int width, int height,
115                       size_t rowBytes, uint32_t color) {
116     if (0 == height || 0 == width || 0 == color) {
117         return;
118     }
119     unsigned colorA = SkGetPackedA32(color);
120     colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
121     if (255 == colorA) {
122         if (width < 31) {
123             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
124                                          rowBytes, color);
125         } else {
126             BlitRect32_OpaqueWide_SSE2(destination, width, height,
127                                        rowBytes, color);
128         }
129     } else {
130         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
131     }
132 }
133