1 /*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRect_opts_SSE2.h"
9 #include "SkBlitRow.h"
10 #include "SkColorPriv.h"
11
12 #include <emmintrin.h>
13
14 /** Simple blitting of opaque rectangles less than 31 pixels wide:
15 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
16 */
BlitRect32_OpaqueNarrow_SSE2(SkPMColor * SK_RESTRICT destination,int width,int height,size_t rowBytes,uint32_t color)17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
18 int width, int height,
19 size_t rowBytes, uint32_t color) {
20 SkASSERT(255 == SkGetPackedA32(color));
21 SkASSERT(width > 0);
22 SkASSERT(width < 31);
23
24 while (--height >= 0) {
25 SkPMColor* dst = destination;
26 int count = width;
27
28 while (count > 4) {
29 *dst++ = color;
30 *dst++ = color;
31 *dst++ = color;
32 *dst++ = color;
33 count -= 4;
34 }
35
36 while (count > 0) {
37 *dst++ = color;
38 --count;
39 }
40
41 destination = (uint32_t*)((char*)destination + rowBytes);
42 }
43 }
44
45 /**
46 Fast blitting of opaque rectangles at least 31 pixels wide:
47 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
  A rectangle at least 31 pixels wide is guaranteed to contain at least one
  16-byte-aligned run of 16 pixels that can be written with _mm_store_si128.
50 */
BlitRect32_OpaqueWide_SSE2(SkPMColor * SK_RESTRICT destination,int width,int height,size_t rowBytes,uint32_t color)51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
52 int width, int height,
53 size_t rowBytes, uint32_t color) {
54 SkASSERT(255 == SkGetPackedA32(color));
55 SkASSERT(width >= 31);
56
57 __m128i color_wide = _mm_set1_epi32(color);
58 while (--height >= 0) {
59 // Prefetching one row ahead to L1 cache can equal hardware
60 // performance for large/tall rects, but never *beats*
61 // hardware performance.
62 SkPMColor* dst = destination;
63 int count = width;
64
65 while (((size_t)dst) & 0x0F) {
66 *dst++ = color;
67 --count;
68 }
69 __m128i *d = reinterpret_cast<__m128i*>(dst);
70
71 // Googling suggests _mm_stream is only going to beat _mm_store
72 // for things that wouldn't fit in L2 cache anyway, typically
73 // >500kB, and precisely fill cache lines. For us, with
74 // arrays > 100k elements _mm_stream is still 100%+ slower than
75 // mm_store.
76
77 // Unrolling to count >= 64 is a break-even for most
78 // input patterns; we seem to be saturating the bus and having
79 // low enough overhead at 32.
80
81 while (count >= 32) {
82 _mm_store_si128(d++, color_wide);
83 _mm_store_si128(d++, color_wide);
84 _mm_store_si128(d++, color_wide);
85 _mm_store_si128(d++, color_wide);
86 _mm_store_si128(d++, color_wide);
87 _mm_store_si128(d++, color_wide);
88 _mm_store_si128(d++, color_wide);
89 _mm_store_si128(d++, color_wide);
90 count -= 32;
91 }
92 if (count >= 16) {
93 _mm_store_si128(d++, color_wide);
94 _mm_store_si128(d++, color_wide);
95 _mm_store_si128(d++, color_wide);
96 _mm_store_si128(d++, color_wide);
97 count -= 16;
98 }
99 dst = reinterpret_cast<uint32_t*>(d);
100
101 // Unrolling the loop in the Narrow code is a significant performance
102 // gain, but unrolling this loop appears to make no difference in
103 // benchmarks with either mm_store_si128 or individual sets.
104
105 while (count > 0) {
106 *dst++ = color;
107 --count;
108 }
109
110 destination = (uint32_t*)((char*)destination + rowBytes);
111 }
112 }
113
ColorRect32_SSE2(SkPMColor * destination,int width,int height,size_t rowBytes,uint32_t color)114 void ColorRect32_SSE2(SkPMColor* destination,
115 int width, int height,
116 size_t rowBytes, uint32_t color) {
117 if (0 == height || 0 == width || 0 == color) {
118 return;
119 }
120 unsigned colorA = SkGetPackedA32(color);
121 if (false && 255 == colorA) { // disabled but compilable to suppress warning
122 if (width < 31) {
123 BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
124 rowBytes, color);
125 } else {
126 BlitRect32_OpaqueWide_SSE2(destination, width, height,
127 rowBytes, color);
128 }
129 } else {
130 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
131 }
132 }
133