• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2014 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBitmap.h"
9 #include "SkBlurImage_opts_SSE4.h"
10 #include "SkColorPriv.h"
11 #include "SkRect.h"
12 
/* Whenever the compiler supports SSE4.1 we build the SSE4 functions and leave it to
 * the caller to determine SSE4 support.  For compilers that do not support SSE4.1
 * we provide a stub implementation instead.
 */
17 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
18 
19 #include <smmintrin.h>
20 
21 namespace {
// Axis selector passed as the template arguments of SkBoxBlur_SSE4, where it picks
// the per-pixel and per-line strides used for the source and destination buffers.
enum BlurDirection {
    kX,
    kY
};
25 
26 /* Helper function to spread the components of a 32-bit integer into the
27  * lower 8 bits of each 32-bit element of an SSE register.
28  */
expand(int a)29 inline __m128i expand(int a) {
30     // ARGB -> 0000 0000 0000 ARGB
31     __m128i widened = _mm_cvtsi32_si128(a);
32     // SSE4.1 has xxxx xxxx xxxx ARGB -> 000A 000R 000G 000B as a one-stop-shop instruction.
33     // It can even work from memory, so a smart compiler probably merges in the _mm_cvtsi32_si128().
34     return _mm_cvtepu8_epi32(widened);
35 }
36 
37 template<BlurDirection srcDirection, BlurDirection dstDirection>
SkBoxBlur_SSE4(const SkPMColor * src,int srcStride,SkPMColor * dst,int kernelSize,int leftOffset,int rightOffset,int width,int height)38 void SkBoxBlur_SSE4(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
39                     int leftOffset, int rightOffset, int width, int height)
40 {
41     const int rightBorder = SkMin32(rightOffset + 1, width);
42     const int srcStrideX = srcDirection == kX ? 1 : srcStride;
43     const int dstStrideX = dstDirection == kX ? 1 : height;
44     const int srcStrideY = srcDirection == kX ? srcStride : 1;
45     const int dstStrideY = dstDirection == kX ? width : 1;
46     const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);
47     const __m128i half = _mm_set1_epi32(1 << 23);
48     for (int y = 0; y < height; ++y) {
49         __m128i sum = _mm_setzero_si128();
50         const SkPMColor* p = src;
51         for (int i = 0; i < rightBorder; ++i) {
52             sum = _mm_add_epi32(sum, expand(*p));
53             p += srcStrideX;
54         }
55 
56         const SkPMColor* sptr = src;
57         SkColor* dptr = dst;
58         for (int x = 0; x < width; ++x) {
59             // TODO(mtklein): We are working in 8.24 here. Drop to 8.8 when the kernel is narrow?
60 
61             // Multiply each component by scale (i.e. divide by kernel size) and add half to round.
62             __m128i result = _mm_mullo_epi32(sum, scale);
63             result = _mm_add_epi32(result, half);
64 
65             // Now pack the top byte of each 32-bit lane back down into one 32-bit color.
66             // Axxx Rxxx Gxxx Bxxx -> xxxx xxxx xxxx ARGB
67             const char _ = 0;  // Don't care what ends up in these bytes.  Happens to be byte 0.
68             result = _mm_shuffle_epi8(result, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));
69 
70             *dptr = _mm_cvtsi128_si32(result);
71 
72             // TODO(mtklein): experiment with breaking this loop into 3 parts
73             if (x >= leftOffset) {
74                 SkColor l = *(sptr - leftOffset * srcStrideX);
75                 sum = _mm_sub_epi32(sum, expand(l));
76             }
77             if (x + rightOffset + 1 < width) {
78                 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);
79                 sum = _mm_add_epi32(sum, expand(r));
80             }
81             sptr += srcStrideX;
82             if (srcDirection == kY) {
83                 // TODO(mtklein): experiment with moving this prefetch forward
84                 _mm_prefetch(reinterpret_cast<const char*>(sptr + (rightOffset + 1) * srcStrideX),
85                              _MM_HINT_T0);
86             }
87             dptr += dstStrideX;
88         }
89         src += srcStrideY;
90         dst += dstStrideY;
91     }
92 }
93 
94 } // namespace
95 
SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc * boxBlurX,SkBoxBlurProc * boxBlurXY,SkBoxBlurProc * boxBlurYX)96 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX,
97                                     SkBoxBlurProc* boxBlurXY,
98                                     SkBoxBlurProc* boxBlurYX) {
99     *boxBlurX = SkBoxBlur_SSE4<kX, kX>;
100     *boxBlurXY = SkBoxBlur_SSE4<kX, kY>;
101     *boxBlurYX = SkBoxBlur_SSE4<kY, kX>;
102     return true;
103 }
104 
105 #else // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
106 
SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc * boxBlurX,SkBoxBlurProc * boxBlurXY,SkBoxBlurProc * boxBlurYX)107 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX,
108                                     SkBoxBlurProc* boxBlurXY,
109                                     SkBoxBlurProc* boxBlurYX) {
110     sk_throw();
111     return false;
112 }
113 
114 
115 #endif
116