• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2024 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/core/SkBlurEngine.h"
9 
10 #include "include/core/SkAlphaType.h"
11 #include "include/core/SkBitmap.h"
12 #include "include/core/SkBlendMode.h"
13 #include "include/core/SkClipOp.h"
14 #include "include/core/SkColor.h"
15 #include "include/core/SkColorSpace.h" // IWYU pragma: keep
16 #include "include/core/SkColorType.h"
17 #include "include/core/SkImageInfo.h"
18 #include "include/core/SkM44.h"
19 #include "include/core/SkMatrix.h"
20 #include "include/core/SkPaint.h"
21 #include "include/core/SkPoint.h"
22 #include "include/core/SkRect.h"
23 #include "include/core/SkSamplingOptions.h"
24 #include "include/core/SkScalar.h"
25 #include "include/core/SkSurfaceProps.h"
26 #include "include/core/SkTileMode.h"
27 #include "include/effects/SkRuntimeEffect.h"
28 #include "include/private/base/SkAssert.h"
29 #include "include/private/base/SkFeatures.h"
30 #include "include/private/base/SkMalloc.h"
31 #include "include/private/base/SkMath.h"
32 #include "include/private/base/SkTo.h"
33 #include "src/base/SkArenaAlloc.h"
34 #include "src/base/SkVx.h"
35 #include "src/core/SkBitmapDevice.h"
36 #include "src/core/SkDevice.h"
37 #include "src/core/SkKnownRuntimeEffects.h"
38 #include "src/core/SkSpecialImage.h"
39 
40 #include <algorithm>
41 #include <array>
42 #include <cmath>
43 #include <cstdint>
44 #include <cstring>
45 #include <utility>
46 
47 
// SK_PREFETCH(ptr): best-effort prefetch hint for the memory at `ptr`.
//  - SSE1-or-better builds use _mm_prefetch with the _MM_HINT_T0 hint.
//  - Other GCC-compatible compilers use __builtin_prefetch.
//  - Everywhere else the macro expands to nothing, so callers must not rely on it for correctness.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <xmmintrin.h>
    #define SK_PREFETCH(ptr) _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0)
#elif defined(__GNUC__)
    #define SK_PREFETCH(ptr) __builtin_prefetch(ptr)
#else
    #define SK_PREFETCH(ptr)
#endif
56 
57 // RasterBlurEngine
58 // ----------------------------------------------------------------------------
59 
60 namespace {
61 
62 class Pass {
63 public:
Pass(int border)64     explicit Pass(int border) : fBorder(border) {}
65     virtual ~Pass() = default;
66 
blur(int srcLeft,int srcRight,int dstRight,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)67     void blur(int srcLeft, int srcRight, int dstRight,
68               const uint32_t* src, int srcStride,
69               uint32_t* dst, int dstStride) {
70         this->startBlur();
71 
72         auto srcStart = srcLeft - fBorder,
73                 srcEnd   = srcRight - fBorder,
74                 dstEnd   = dstRight,
75                 srcIdx   = srcStart,
76                 dstIdx   = 0;
77 
78         const uint32_t* srcCursor = src;
79         uint32_t* dstCursor = dst;
80 
81         if (dstIdx < srcIdx) {
82             // The destination pixels are not effected by the src pixels,
83             // change to zero as per the spec.
84             // https://drafts.fxtf.org/filter-effects/#FilterPrimitivesOverviewIntro
85             int commonEnd = std::min(srcIdx, dstEnd);
86             while (dstIdx < commonEnd) {
87                 *dstCursor = 0;
88                 dstCursor += dstStride;
89                 SK_PREFETCH(dstCursor);
90                 dstIdx++;
91             }
92         } else if (srcIdx < dstIdx) {
93             // The edge of the source is before the edge of the destination. Calculate the sums for
94             // the pixels before the start of the destination.
95             if (int commonEnd = std::min(dstIdx, srcEnd); srcIdx < commonEnd) {
96                 // Preload the blur with values from src before dst is entered.
97                 int n = commonEnd - srcIdx;
98                 this->blurSegment(n, srcCursor, srcStride, nullptr, 0);
99                 srcIdx += n;
100                 srcCursor += n * srcStride;
101             }
102             if (srcIdx < dstIdx) {
103                 // The weird case where src is out of pixels before dst is even started.
104                 int n = dstIdx - srcIdx;
105                 this->blurSegment(n, nullptr, 0, nullptr, 0);
106                 srcIdx += n;
107             }
108         }
109 
110         if (int commonEnd = std::min(dstEnd, srcEnd); dstIdx < commonEnd) {
111             // Both srcIdx and dstIdx are in sync now, and can run in a 1:1 fashion. This is the
112             // normal mode of operation.
113             SkASSERT(srcIdx == dstIdx);
114 
115             int n = commonEnd - dstIdx;
116             this->blurSegment(n, srcCursor, srcStride, dstCursor, dstStride);
117             srcCursor += n * srcStride;
118             dstCursor += n * dstStride;
119             dstIdx += n;
120             srcIdx += n;
121         }
122 
123         // Drain the remaining blur values into dst assuming 0's for the leading edge.
124         if (dstIdx < dstEnd) {
125             int n = dstEnd - dstIdx;
126             this->blurSegment(n, nullptr, 0, dstCursor, dstStride);
127         }
128     }
129 
130 protected:
131     virtual void startBlur() = 0;
132     virtual void blurSegment(
133             int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) = 0;
134 
135 private:
136     const int fBorder;
137 };
138 
139 class PassMaker {
140 public:
PassMaker(int window)141     explicit PassMaker(int window) : fWindow{window} {}
142     virtual ~PassMaker() = default;
143     virtual Pass* makePass(void* buffer, SkArenaAlloc* alloc) const = 0;
144     virtual size_t bufferSizeBytes() const = 0;
window() const145     int window() const {return fWindow;}
146 
147 private:
148     const int fWindow;
149 };
150 
// Implement a scanline processor that uses a three-box filter to approximate a Gaussian blur.
// The GaussPass is limited to processing sigmas < 135.
class GaussPass final : public Pass {
public:
    // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
    // using the Gauss filter. It also limits the size of buffers used to hold intermediate values.
    // Explanation of maximums:
    //   sum0 = window * 255
    //   sum1 = window * sum0 -> window * window * 255
    //   sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255
    //
    //   The value window^3 * 255 must fit in a uint32_t. So,
    //      window^3 < 2^32. window = 255.
    //
    //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
    //   For window <= 255, the largest value for sigma is 136.
    //
    // Returns a PassMaker allocated from `alloc`, or nullptr when the window derived from `sigma`
    // is too large for this pass.
    static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) {
        SkASSERT(0 <= sigma);
        int window = SkBlurEngine::BoxBlurWindow(sigma);
        // Note that a window of exactly 255 is rejected as well, which is slightly more
        // conservative than the analysis above.
        if (255 <= window) {
            return nullptr;
        }

        class Maker : public PassMaker {
        public:
            explicit Maker(int window) : PassMaker{window} {}
            Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
                return GaussPass::Make(this->window(), buffer, alloc);
            }

            // Must agree with how GaussPass::Make() carves up the buffer: three circular buffers
            // of (window - 1) vectors, plus one extra vector when the window is even.
            size_t bufferSizeBytes() const override {
                int window = this->window();
                size_t onePassSize = window - 1;
                // If the window is odd, then there is an obvious middle element. For even sizes
                // 2 passes are shifted, and the last pass has an extra element. Like this:
                //       S
                //    aaaAaa
                //     bbBbbb
                //    cccCccc
                //       D
                size_t bufferCount = (window & 1) == 1 ? 3 * onePassSize : 3 * onePassSize + 1;
                return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
            }
        };

        return alloc->make<Maker>(window);
    }

    // Builds a GaussPass (allocated from `alloc`) for the given window. `buffers` is split into
    // the three circular buffers used by blurSegment(); it must provide at least the number of
    // bytes reported by the Maker's bufferSizeBytes() for the same window.
    static GaussPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
        // We don't need to store the trailing edge pixel in the buffer;
        int passSize = window - 1;
        skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
        skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
        skvx::Vec<4, uint32_t>* buffer2 = buffer1 + passSize;
        // If the window is odd just one buffer is needed, but if it's even, then there is one
        // more element on that pass.
        skvx::Vec<4, uint32_t>* buffersEnd = buffer2 + ((window & 1) ? passSize : passSize + 1);

        // Calculating the border is tricky. The border is the distance in pixels between the first
        // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
        // I will go through the odd case which is simpler, and then through the even case. Given a
        // stack of filters seven wide for the odd case of three passes.
        //
        //        S
        //     aaaAaaa
        //     bbbBbbb
        //     cccCccc
        //        D
        //
        // The furthest changed pixel is when the filters are in the following configuration.
        //
        //                 S
        //           aaaAaaa
        //        bbbBbbb
        //     cccCccc
        //        D
        //
        // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
        // finally D is C. So, with a window size of seven the border is nine. In the odd case, the
        // border is 3*((window - 1)/2).
        //
        // For even cases the filter stack is more complicated. The spec specifies two passes
        // of even filters and a final pass of odd filters. A stack for a width of six looks like
        // this.
        //
        //       S
        //    aaaAaa
        //     bbBbbb
        //    cccCccc
        //       D
        //
        // The furthest pixel looks like this.
        //
        //               S
        //          aaaAaa
        //        bbBbbb
        //    cccCccc
        //       D
        //
        // For a window of six, the border value is eight. In the even case the border is 3 *
        // (window/2) - 1.
        int border = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;

        // If the window is odd then the divisor is just window ^ 3 otherwise,
        // it is window * window * (window + 1) = window ^ 3 + window ^ 2;
        int window2 = window * window;
        int window3 = window2 * window;
        int divisor = (window & 1) == 1 ? window3 : window3 + window2;
        return alloc->make<GaussPass>(buffer0, buffer1, buffer2, buffersEnd, border, divisor);
    }

    // buffer0/buffer1/buffer2 are the starts of the three circular buffers, laid out back to back
    // and terminated by buffersEnd. `border` is forwarded to Pass and `divisor` is the value the
    // final sum is divided by to produce an output pixel.
    GaussPass(skvx::Vec<4, uint32_t>* buffer0,
              skvx::Vec<4, uint32_t>* buffer1,
              skvx::Vec<4, uint32_t>* buffer2,
              skvx::Vec<4, uint32_t>* buffersEnd,
              int border,
              int divisor)
        : Pass{border}
        , fBuffer0{buffer0}
        , fBuffer1{buffer1}
        , fBuffer2{buffer2}
        , fBuffersEnd{buffersEnd}
        , fDivider(divisor) {}

private:
    // Resets the running sums, clears all three circular buffers and rewinds their cursors.
    void startBlur() override {
        skvx::Vec<4, uint32_t> zero = {0u, 0u, 0u, 0u};
        zero.store(fSum0);
        zero.store(fSum1);
        // Seed the last sum with fDivider.half() so that the later divide rounds instead of
        // truncating.
        auto half = fDivider.half();
        skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum2);
        sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));

        fBuffer0Cursor = fBuffer0;
        fBuffer1Cursor = fBuffer1;
        fBuffer2Cursor = fBuffer2;
    }

    // GaussPass implements the common three pass box filter approximation of Gaussian blur,
    // but combines all three passes into a single pass. This approach is facilitated by three
    // circular buffers the width of the window which track values for trailing edges of each of
    // the three passes. This allows the algorithm to use more precision in the calculation
    // because the values are not rounded each pass. And this implementation also avoids a trap
    // that's easy to fall into resulting in blending in too many zeroes near the edge.
    //
    // In general, a window sum has the form:
    //     sum_n+1 = sum_n + leading_edge - trailing_edge.
    // If instead we do the subtraction at the end of the previous iteration, we can just
    // calculate the sums instead of having to do the subtractions too.
    //
    //      In previous iteration:
    //      sum_n+1 = sum_n - trailing_edge.
    //
    //      In this iteration:
    //      sum_n+1 = sum_n + leading_edge.
    //
    // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
    // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
    // three passes at the same time has the form:
    //
    //    sum0_n+1 = sum0_n + leading edge
    //    sum1_n+1 = sum1_n + sum0_n+1
    //    sum2_n+1 = sum2_n + sum1_n+1
    //
    //    sum2_n+1 / window^3 is the new value of the destination pixel.
    //
    // Reduce the sums by the trailing edges which were stored in the circular buffers for the
    // next go around. This is the case for odd sized windows; for even windows the third
    // circular buffer is one larger than the first two circular buffers.
    //
    //    sum2_n+2 = sum2_n+1 - buffer2[i];
    //    buffer2[i] = sum1;
    //    sum1_n+2 = sum1_n+1 - buffer1[i];
    //    buffer1[i] = sum0;
    //    sum0_n+2 = sum0_n+1 - buffer0[i];
    //    buffer0[i] = leading edge
    //
    // A null `src` feeds zeros into the filter and a null `dst` discards the output; strides are
    // in uint32_t units. The sums and cursors are copied into locals for the loop and written back
    // to the members at the end so consecutive calls continue where the previous one stopped.
    void blurSegment(
            int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
#if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
        // LoongArch LSX variant. It follows the same steps as the portable branch below, spelled
        // with LSX intrinsics.
        skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
        skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
        skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
        v4u32 sum0 = __lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0);
        v4u32 sum1 = __lsx_vld(fSum1, 0);
        v4u32 sum2 = __lsx_vld(fSum2, 0);

        // Advances the three stacked sums by one pixel and returns the blurred pixel.
        auto processValue = [&](v4u32& vLeadingEdge){
          sum0 += vLeadingEdge;
          sum1 += sum0;
          sum2 += sum1;

          // NOTE(review): presumably the high half of divisorFactor * sum2, i.e. the scaled
          // division that fDivider.divide() performs in the portable branch -- confirm.
          v4u32 divisorFactor = __lsx_vreplgr2vr_w(fDivider.divisorFactor());
          v4u32 blurred = __lsx_vmuh_w(divisorFactor, sum2);

          // Retire the trailing edges and record the new values, wrapping each cursor back to the
          // start of its own circular buffer.
          v4u32 buffer2Value = __lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds.
          sum2 -= buffer2Value;
          __lsx_vst(sum1, (void *)buffer2Cursor, 0);
          buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
          v4u32 buffer1Value = __lsx_vld(buffer1Cursor, 0);
          sum1 -= buffer1Value;
          __lsx_vst(sum0, (void *)buffer1Cursor, 0);
          buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
          v4u32 buffer0Value = __lsx_vld(buffer0Cursor, 0);
          sum0 -= buffer0Value;
          __lsx_vst(vLeadingEdge, (void *)buffer0Cursor, 0);
          buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;

          // NOTE(review): the shuffle indices 0, 4, 8, 12 look like they gather the low byte of
          // each 32-bit lane into the first four bytes (one packed pixel) -- confirm.
          v16u8 shuf = {0x0,0x4,0x8,0xc,0x0};
          v16u8 ret = __lsx_vshuf_b(blurred, blurred, shuf);
          return ret;
        };

        v4u32 zero = __lsx_vldi(0x0);
        if (!src && !dst) {
            // No input, no output: just advance the filter with zeros.
            while (n --> 0) {
                (void)processValue(zero);
            }
        } else if (src && !dst) {
            // Prime the filter with source pixels without writing anything.
            while (n --> 0) {
                // NOTE(review): presumably widens the four bytes of *src to four 32-bit lanes by
                // interleaving with zero twice -- confirm.
                v4u32 edge = __lsx_vinsgr2vr_w(zero, *src, 0);
                edge = __lsx_vilvl_b(zero, edge);
                edge = __lsx_vilvl_h(zero, edge);
                (void)processValue(edge);
                src += srcStride;
            }
        } else if (!src && dst) {
            // Source exhausted: drain the filter into dst.
            while (n --> 0) {
                v4u32 ret = processValue(zero);
                __lsx_vstelm_w(ret, dst, 0, 0); // 3rd is offset, 4th is idx.
                dst += dstStride;
            }
        } else if (src && dst) {
            // Normal case: read a source pixel, write a blurred pixel.
            while (n --> 0) {
                v4u32 edge = __lsx_vinsgr2vr_w(zero, *src, 0);
                edge = __lsx_vilvl_b(zero, edge);
                edge = __lsx_vilvl_h(zero, edge);
                v4u32 ret = processValue(edge);
                __lsx_vstelm_w(ret, dst, 0, 0);
                src += srcStride;
                dst += dstStride;
            }
        }

        // Store the state
        fBuffer0Cursor = buffer0Cursor;
        fBuffer1Cursor = buffer1Cursor;
        fBuffer2Cursor = buffer2Cursor;

        __lsx_vst(sum0, fSum0, 0);
        __lsx_vst(sum1, fSum1, 0);
        __lsx_vst(sum2, fSum2, 0);
#else
        // Portable skvx variant.
        skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
        skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
        skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
        skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
        skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
        skvx::Vec<4, uint32_t> sum2 = skvx::Vec<4, uint32_t>::Load(fSum2);

        // Given an expanded input pixel, move the window ahead using the leadingEdge value.
        auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
            sum0 += leadingEdge;
            sum1 += sum0;
            sum2 += sum1;

            skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum2);

            // Retire the trailing edges and record the new values, wrapping each cursor back to
            // the start of its own circular buffer.
            sum2 -= *buffer2Cursor;
            *buffer2Cursor = sum1;
            buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
            sum1 -= *buffer1Cursor;
            *buffer1Cursor = sum0;
            buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
            sum0 -= *buffer0Cursor;
            *buffer0Cursor = leadingEdge;
            buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;

            return skvx::cast<uint8_t>(blurred);
        };

        // Expands the four bytes of one 32-bit pixel into four 32-bit lanes.
        auto loadEdge = [&](const uint32_t* srcCursor) {
            return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
        };

        if (!src && !dst) {
            // No input, no output: just advance the filter with zeros.
            while (n --> 0) {
                (void)processValue(0);
            }
        } else if (src && !dst) {
            // Prime the filter with source pixels without writing anything.
            while (n --> 0) {
                (void)processValue(loadEdge(src));
                src += srcStride;
            }
        } else if (!src && dst) {
            // Source exhausted: drain the filter into dst.
            while (n --> 0) {
                processValue(0u).store(dst);
                dst += dstStride;
            }
        } else if (src && dst) {
            // Normal case: read a source pixel, write a blurred pixel.
            while (n --> 0) {
                processValue(loadEdge(src)).store(dst);
                src += srcStride;
                dst += dstStride;
            }
        }

        // Store the state
        fBuffer0Cursor = buffer0Cursor;
        fBuffer1Cursor = buffer1Cursor;
        fBuffer2Cursor = buffer2Cursor;

        sum0.store(fSum0);
        sum1.store(fSum1);
        sum2.store(fSum2);
#endif
    }

    // Starts of the three back-to-back circular buffers; fBuffersEnd is one past the last element
    // of the third buffer. The storage itself is supplied by the caller of Make().
    skvx::Vec<4, uint32_t>* const fBuffer0;
    skvx::Vec<4, uint32_t>* const fBuffer1;
    skvx::Vec<4, uint32_t>* const fBuffer2;
    skvx::Vec<4, uint32_t>* const fBuffersEnd;
    // Divides the final sum by the divisor computed in Make().
    const skvx::ScaledDividerU32 fDivider;

    // blur state
    // The running sums are kept as raw bytes and accessed through vector load/store calls.
    char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
    char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
    char fSum2[sizeof(skvx::Vec<4, uint32_t>)];
    // Current position within each circular buffer.
    skvx::Vec<4, uint32_t>* fBuffer0Cursor;
    skvx::Vec<4, uint32_t>* fBuffer1Cursor;
    skvx::Vec<4, uint32_t>* fBuffer2Cursor;
};
482 
483 // Implement a scanline processor that uses a two-box filter to approximate a Tent filter.
484 // The TentPass is limit to processing sigmas < 2183.
485 class TentPass final : public Pass {
486 public:
487     // NB 2183 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
488     // using the Tent filter. It also limits the size of buffers used hold intermediate values.
489     // Explanation of maximums:
490     //   sum0 = window * 255
491     //   sum1 = window * sum0 -> window * window * 255
492     //
493     //   The value window^2 * 255 must fit in a uint32_t. So,
494     //      window^2 < 2^32. window = 4104.
495     //
496     //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
497     //   For window <= 4104, the largest value for sigma is 2183.
MakeMaker(float sigma,SkArenaAlloc * alloc)498     static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) {
499         SkASSERT(0 <= sigma);
500         int gaussianWindow = SkBlurEngine::BoxBlurWindow(sigma);
501         // This is a naive method of using the window size for the Gaussian blur to calculate the
502         // window size for the Tent blur. This seems to work well in practice.
503         //
504         // We can use a single pixel to generate the effective blur area given a window size. For
505         // the Gaussian blur this is 3 * window size. For the Tent filter this is 2 * window size.
506         int tentWindow = 3 * gaussianWindow / 2;
507         if (tentWindow >= 4104) {
508             return nullptr;
509         }
510 
511         class Maker : public PassMaker {
512         public:
513             explicit Maker(int window) : PassMaker{window} {}
514             Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
515                 return TentPass::Make(this->window(), buffer, alloc);
516             }
517 
518             size_t bufferSizeBytes() const override {
519                 size_t onePassSize = this->window() - 1;
520                 // If the window is odd, then there is an obvious middle element. For even sizes 2
521                 // passes are shifted, and the last pass has an extra element. Like this:
522                 //       S
523                 //    aaaAaa
524                 //     bbBbbb
525                 //       D
526                 size_t bufferCount = 2 * onePassSize;
527                 return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
528             }
529         };
530 
531         return alloc->make<Maker>(tentWindow);
532     }
533 
Make(int window,void * buffers,SkArenaAlloc * alloc)534     static TentPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
535         if (window > 4104) {
536             return nullptr;
537         }
538 
539         // We don't need to store the trailing edge pixel in the buffer;
540         int passSize = window - 1;
541         skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
542         skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
543         skvx::Vec<4, uint32_t>* buffersEnd = buffer1 + passSize;
544 
545         // Calculating the border is tricky. The border is the distance in pixels between the first
546         // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
547         // I will go through the odd case which is simpler, and then through the even case. Given a
548         // stack of filters seven wide for the odd case of three passes.
549         //
550         //        S
551         //     aaaAaaa
552         //     bbbBbbb
553         //        D
554         //
555         // The furthest changed pixel is when the filters are in the following configuration.
556         //
557         //              S
558         //        aaaAaaa
559         //     bbbBbbb
560         //        D
561         //
562         // The A pixel is calculated using the value S, the B uses A, and the D uses B.
563         // So, with a window size of seven the border is nine. In the odd case, the border is
564         // window - 1.
565         //
566         // For even cases the filter stack is more complicated. It uses two passes
567         // of even filters offset from each other. A stack for a width of six looks like
568         // this.
569         //
570         //       S
571         //    aaaAaa
572         //     bbBbbb
573         //       D
574         //
575         // The furthest pixel looks like this.
576         //
577         //            S
578         //       aaaAaa
579         //     bbBbbb
580         //       D
581         //
582         // For a window of six, the border value is 5. In the even case the border is
583         // window - 1.
584         int border = window - 1;
585 
586         int divisor = window * window;
587         return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisor);
588     }
589 
TentPass(skvx::Vec<4,uint32_t> * buffer0,skvx::Vec<4,uint32_t> * buffer1,skvx::Vec<4,uint32_t> * buffersEnd,int border,int divisor)590     TentPass(skvx::Vec<4, uint32_t>* buffer0,
591              skvx::Vec<4, uint32_t>* buffer1,
592              skvx::Vec<4, uint32_t>* buffersEnd,
593              int border,
594              int divisor)
595          : Pass{border}
596          , fBuffer0{buffer0}
597          , fBuffer1{buffer1}
598          , fBuffersEnd{buffersEnd}
599          , fDivider(divisor) {}
600 
601 private:
startBlur()602     void startBlur() override {
603         skvx::Vec<4, uint32_t>{0u, 0u, 0u, 0u}.store(fSum0);
604         auto half = fDivider.half();
605         skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum1);
606         sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
607 
608         fBuffer0Cursor = fBuffer0;
609         fBuffer1Cursor = fBuffer1;
610     }
611 
612     // TentPass implements the common two pass box filter approximation of Tent filter,
613     // but combines all both passes into a single pass. This approach is facilitated by two
614     // circular buffers the width of the window which track values for trailing edges of each of
615     // both passes. This allows the algorithm to use more precision in the calculation
616     // because the values are not rounded each pass. And this implementation also avoids a trap
617     // that's easy to fall into resulting in blending in too many zeroes near the edge.
618     //
619     // In general, a window sum has the form:
620     //     sum_n+1 = sum_n + leading_edge - trailing_edge.
621     // If instead we do the subtraction at the end of the previous iteration, we can just
622     // calculate the sums instead of having to do the subtractions too.
623     //
624     //      In previous iteration:
625     //      sum_n+1 = sum_n - trailing_edge.
626     //
627     //      In this iteration:
628     //      sum_n+1 = sum_n + leading_edge.
629     //
630     // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
631     // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
632     // three passes at the same time has the form:
633     //
634     //    sum0_n+1 = sum0_n + leading edge
635     //    sum1_n+1 = sum1_n + sum0_n+1
636     //
637     //    sum1_n+1 / window^2 is the new value of the destination pixel.
638     //
639     // Reduce the sums by the trailing edges which were stored in the circular buffers for the
640     // next go around.
641     //
642     //    sum1_n+2 = sum1_n+1 - buffer1[i];
643     //    buffer1[i] = sum0;
644     //    sum0_n+2 = sum0_n+1 - buffer0[i];
645     //    buffer0[i] = leading edge
blurSegment(int n,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)646     void blurSegment(
647             int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
648         skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
649         skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
650         skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
651         skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
652 
653         // Given an expanded input pixel, move the window ahead using the leadingEdge value.
654         auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
655             sum0 += leadingEdge;
656             sum1 += sum0;
657 
658             skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum1);
659 
660             sum1 -= *buffer1Cursor;
661             *buffer1Cursor = sum0;
662             buffer1Cursor = (buffer1Cursor + 1) < fBuffersEnd ? buffer1Cursor + 1 : fBuffer1;
663             sum0 -= *buffer0Cursor;
664             *buffer0Cursor = leadingEdge;
665             buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
666 
667             return skvx::cast<uint8_t>(blurred);
668         };
669 
670         auto loadEdge = [&](const uint32_t* srcCursor) {
671             return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
672         };
673 
674         if (!src && !dst) {
675             while (n --> 0) {
676                 (void)processValue(0);
677             }
678         } else if (src && !dst) {
679             while (n --> 0) {
680                 (void)processValue(loadEdge(src));
681                 src += srcStride;
682             }
683         } else if (!src && dst) {
684             while (n --> 0) {
685                 processValue(0u).store(dst);
686                 dst += dstStride;
687             }
688         } else if (src && dst) {
689             while (n --> 0) {
690                 processValue(loadEdge(src)).store(dst);
691                 src += srcStride;
692                 dst += dstStride;
693             }
694         }
695 
696         // Store the state
697         fBuffer0Cursor = buffer0Cursor;
698         fBuffer1Cursor = buffer1Cursor;
699         sum0.store(fSum0);
700         sum1.store(fSum1);
701     }
702 
703     skvx::Vec<4, uint32_t>* const fBuffer0;
704     skvx::Vec<4, uint32_t>* const fBuffer1;
705     skvx::Vec<4, uint32_t>* const fBuffersEnd;
706     const skvx::ScaledDividerU32 fDivider;
707 
708     // blur state
709     char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
710     char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
711     skvx::Vec<4, uint32_t>* fBuffer0Cursor;
712     skvx::Vec<4, uint32_t>* fBuffer1Cursor;
713 };
714 
715 class Raster8888BlurAlgorithm : public SkBlurEngine::Algorithm {
716 public:
717     // See analysis in description of TentPass for the max supported sigma.
maxSigma() const718     float maxSigma() const override {
719         // TentPass supports a sigma up to 2183, and was added so that the CPU blur algorithm's
720         // blur radius was as large as that supported by the GPU. GaussPass only supports up to 136.
721         // However, there is a very apparent pop in blur weight when switching from successive box
722         // blurs to the tent filter. The TentPass is preserved for legacy blurs, which do not use
723         // FilterResult::rescale(). However, using kMaxSigma = 135 with the raster SkBlurEngine
724         // ensures that the non-legacy raster blurs will always use the GaussPass implementation.
725         // This is about 6-7x faster on large blurs to rescale a few times to a lower resolution
726         // than it is to evaluate the much larger original window.
727         static constexpr float kMaxSigma = 135.f;
728         SkASSERT(SkBlurEngine::BoxBlurWindow(kMaxSigma) <= 255); // see GaussPass::MakeMaker().
729         return kMaxSigma;
730     }
731 
732     // TODO: Implement CPU backend for different fTileMode. This is still worth doing inline with
733     // the blur; at the moment the tiling is applied via the CropImageFilter and carried as metadata
734     // on the FilterResult. This is forcefully applied in FilterResult::Builder::blur() when
735     // supportsOnlyDecalTiling() returns true.
supportsOnlyDecalTiling() const736     bool supportsOnlyDecalTiling() const override { return true; }
737 
blur(SkSize sigma,sk_sp<SkSpecialImage> input,const SkIRect & originalSrcBounds,SkTileMode tileMode,const SkIRect & originalDstBounds) const738     sk_sp<SkSpecialImage> blur(SkSize sigma,
739                                sk_sp<SkSpecialImage> input,
740                                const SkIRect& originalSrcBounds,
741                                SkTileMode tileMode,
742                                const SkIRect& originalDstBounds) const override {
743         // TODO: Enable this assert when the TentPass is no longer used for legacy blurs
744         // (which supports blur sigmas larger than what's reported in maxSigma()).
745         // SkASSERT(sigma.width() <= this->maxSigma() && sigma.height() <= this->maxSigma());
746         SkASSERT(tileMode == SkTileMode::kDecal);
747 
748         SkASSERT(SkIRect::MakeSize(input->dimensions()).contains(originalSrcBounds));
749 
750         SkBitmap src;
751         if (!SkSpecialImages::AsBitmap(input.get(), &src)) {
752             return nullptr; // Should only have been called by CPU-backed images
753         }
754         // The blur engine should not have picked this algorithm for a non-32-bit color type
755         SkASSERT(src.colorType() == kRGBA_8888_SkColorType ||
756                  src.colorType() == kBGRA_8888_SkColorType);
757 
758         SkSTArenaAlloc<1024> alloc;
759         auto makeMaker = [&](float sigma) -> PassMaker* {
760             SkASSERT(0 <= sigma && sigma <= 2183); // should be guaranteed after map_sigma
761             if (PassMaker* maker = GaussPass::MakeMaker(sigma, &alloc)) {
762                 return maker;
763             }
764             if (PassMaker* maker = TentPass::MakeMaker(sigma, &alloc)) {
765                 return maker;
766             }
767             SK_ABORT("Sigma is out of range.");
768         };
769 
770         PassMaker* makerX = makeMaker(sigma.width());
771         PassMaker* makerY = makeMaker(sigma.height());
772 
773 #if !defined(SK_AVOID_SLOW_RASTER_PIPELINE_BLURS)
774         // A blur with a sigma smaller than the successive box-blurs accuracy should have been
775         // routed to the shader-based algorithm.
776         SkASSERT(makerX->window() > 1 || makerY->window() > 1);
777 #endif
778 
779         SkIRect srcBounds = originalSrcBounds;
780         SkIRect dstBounds = originalDstBounds;
781         if (makerX->window() > 1) {
782             // Inflate the dst by the window required for the Y pass so that the X pass can prepare
783             // it. The Y pass will be offset to only write to the original rows in dstBounds, but
784             // its window will access these extra rows calculated by the X pass. The SpecialImage
785             // factory will then subset the bitmap so it appears to match 'originalDstBounds'
786             // tightly. We make one slightly larger image to hold this extra data instead of two
787             // separate images sized exactly to each pass because the CPU blur can write in place.
788             dstBounds.outset(0, SkBlurEngine::SigmaToRadius(sigma.height()));
789         }
790 
791         SkBitmap dst;
792         const SkIPoint dstOrigin = dstBounds.topLeft();
793         if (!dst.tryAllocPixels(src.info().makeWH(dstBounds.width(), dstBounds.height()))) {
794             return nullptr;
795         }
796         dst.eraseColor(SK_ColorTRANSPARENT);
797 
798         auto buffer = alloc.makeBytesAlignedTo(std::max(makerX->bufferSizeBytes(),
799                                                         makerY->bufferSizeBytes()),
800                                             alignof(skvx::Vec<4, uint32_t>));
801 
802         // Basic Plan: The three cases to handle
803         // * Horizontal and Vertical - blur horizontally while copying values from the source to
804         //     the destination. Then, do an in-place vertical blur.
805         // * Horizontal only - blur horizontally copying values from the source to the destination.
806         // * Vertical only - blur vertically copying values from the source to the destination.
807 
808         // Initialize these assuming the Y-only case
809         int loopStart  = std::max(srcBounds.left(),  dstBounds.left());
810         int loopEnd    = std::min(srcBounds.right(), dstBounds.right());
811         int dstYOffset = 0;
812 
813         if (makerX->window() > 1) {
814             // First an X-only blur from src into dst, including the extra rows that will become
815             // input for the second Y pass, which will then be performed in place.
816             loopStart = std::max(srcBounds.top(),    dstBounds.top());
817             loopEnd   = std::min(srcBounds.bottom(), dstBounds.bottom());
818 
819             auto srcAddr = src.getAddr32(0, loopStart - srcBounds.top());
820             auto dstAddr = dst.getAddr32(0, loopStart - dstBounds.top());
821 
822             // Iterate over each row to calculate 1D blur along X.
823             Pass* pass = makerX->makePass(buffer, &alloc);
824             for (int y = loopStart; y < loopEnd; ++y) {
825                 pass->blur(srcBounds.left()  - dstBounds.left(),
826                            srcBounds.right() - dstBounds.left(),
827                            dstBounds.width(),
828                            srcAddr, 1,
829                            dstAddr, 1);
830                 srcAddr += src.rowBytesAsPixels();
831                 dstAddr += dst.rowBytesAsPixels();
832             }
833 
834             // Set up the Y pass to blur from the full dst into the non-outset portion of dst
835             src = dst;
836             loopStart = originalDstBounds.left();
837             loopEnd   = originalDstBounds.right();
838             // The new 'dst' is equal to dst.extractSubset(originalDstBounds.offset(-dstOrigin)),
839             // but by construction only the Y offset has an interesting value so this is a little
840             // more efficient.
841             dstYOffset = originalDstBounds.top() - dstBounds.top();
842 
843             srcBounds = dstBounds;
844             dstBounds = originalDstBounds;
845         }
846 
847         // Iterate over each column to calculate 1D blur along Y. This is either blurring from src
848         // into dst for a 1D blur; or it's blurring from dst into dst for the second pass of a 2D
849         // blur.
850         if (makerY->window() > 1) {
851             auto srcAddr = src.getAddr32(loopStart - srcBounds.left(), 0);
852             auto dstAddr = dst.getAddr32(loopStart - dstBounds.left(), dstYOffset);
853 
854             Pass* pass = makerY->makePass(buffer, &alloc);
855             for (int x = loopStart; x < loopEnd; ++x) {
856                 pass->blur(srcBounds.top()    - dstBounds.top(),
857                            srcBounds.bottom() - dstBounds.top(),
858                            dstBounds.height(),
859                            srcAddr, src.rowBytesAsPixels(),
860                            dstAddr, dst.rowBytesAsPixels());
861                 srcAddr += 1;
862                 dstAddr += 1;
863             }
864         }
865 
866 #if defined(SK_AVOID_SLOW_RASTER_PIPELINE_BLURS)
867         // When avoiding the shader-based algorithm, handle the box identity case.
868         if (makerX->window() == 1 && makerY->window() == 1) {
869             dst.writePixels(src.pixmap(),
870                             srcBounds.left() - dstBounds.left(),
871                             srcBounds.top()  - dstBounds.top());
872         }
873 #endif
874 
875         dstBounds = originalDstBounds.makeOffset(-dstOrigin); // Make relative to dst's pixels
876         return SkSpecialImages::MakeFromRaster(dstBounds, dst, SkSurfaceProps{});
877     }
878 
879 };
880 
881 class RasterShaderBlurAlgorithm : public SkShaderBlurAlgorithm {
882 public:
makeDevice(const SkImageInfo & imageInfo) const883     sk_sp<SkDevice> makeDevice(const SkImageInfo& imageInfo) const override {
884         // This Device will only be used to draw blurs, so use default SkSurfaceProps. The pixel
885         // geometry and font configuration do not matter. This is not a GPU surface, so DMSAA and
886         // the kAlwaysDither surface property are also irrelevant.
887         return SkBitmapDevice::Create(imageInfo, SkSurfaceProps{});
888     }
889 };
890 
891 class RasterBlurEngine : public SkBlurEngine {
892 public:
findAlgorithm(SkSize sigma,SkColorType colorType) const893     const Algorithm* findAlgorithm(SkSize sigma,  SkColorType colorType) const override {
894 #if defined(SK_AVOID_SLOW_RASTER_PIPELINE_BLURS)
895         // For large source images, the shader-based blur can be prohibitively slow so setting this
896         // to zero means it'll never be used for 8888 color types.
897         static constexpr float kBoxBlurMinSigma = 0.f;
898 #else
899         static constexpr float kBoxBlurMinSigma = 2.f;
900 
901         // If the sigma is larger than kBoxBlurMinSigma, we should assume that we won't encounter
902         // an identity window assertion later on.
903         SkASSERT(SkBlurEngine::BoxBlurWindow(kBoxBlurMinSigma) > 1);
904 #endif
905 
906         // Using the shader-based blur for small blur sigmas only happens if both axes require a
907         // small blur. It's assumed that any inaccuracy along one axis is hidden by the large enough
908         // blur along the other axis.
909         const bool smallBlur = sigma.width() < kBoxBlurMinSigma &&
910                                sigma.height() < kBoxBlurMinSigma;
911         // The box blur doesn't actually care about channel order as long as it's 4 8-bit channels.
912         const bool rgba8Blur = colorType == kRGBA_8888_SkColorType ||
913                                colorType == kBGRA_8888_SkColorType;
914         // TODO: Specialize A8 color types as well by reusing the mask filter blur impl
915         if (smallBlur || !rgba8Blur) {
916             return &fShaderBlurAlgorithm;
917         } else {
918             return &fRGBA8BlurAlgorithm;
919         }
920     }
921 
922 private:
923     // For small sigmas and non-8888 or A8 color types, use the shader algorithm
924     RasterShaderBlurAlgorithm fShaderBlurAlgorithm;
925     // For large blurs with RGBA8 or BGRA8, use consecutive box blurs
926     Raster8888BlurAlgorithm fRGBA8BlurAlgorithm;
927 };
928 
929 } // anonymous namespace
930 
GetRasterBlurEngine()931 const SkBlurEngine* SkBlurEngine::GetRasterBlurEngine() {
932     static const RasterBlurEngine kInstance;
933     return &kInstance;
934 }
935 
936 // SkShaderBlurAlgorithm
937 // ----------------------------------------------------------------------------
938 
Compute2DBlurKernel(SkSize sigma,SkISize radius,SkSpan<float> kernel)939 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
940                                                 SkISize radius,
941                                                 SkSpan<float> kernel) {
942     // Callers likely had to calculate the radius prior to filling out the kernel value, which is
943     // why it's provided; but make sure it's consistent with expectations.
944     SkASSERT(SkBlurEngine::SigmaToRadius(sigma.width()) == radius.width() &&
945              SkBlurEngine::SigmaToRadius(sigma.height()) == radius.height());
946 
947     // Callers are responsible for downscaling large sigmas to values that can be processed by the
948     // effects, so ensure the radius won't overflow 'kernel'
949     const int width = KernelWidth(radius.width());
950     const int height = KernelWidth(radius.height());
951     const size_t kernelSize = SkTo<size_t>(sk_64_mul(width, height));
952     SkASSERT(kernelSize <= kernel.size());
953 
954     // And the definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
955     // when there's a non-trivial radius.
956     const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
957     const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
958     SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoSigmaSqrdX)) &&
959              (radius.height() == 0 || !SkScalarNearlyZero(twoSigmaSqrdY)));
960 
961     // Setting the denominator to 1 when the radius is 0 automatically converts the remaining math
962     // to the 1D Gaussian distribution. When both radii are 0, it correctly computes a weight of 1.0
963     const float sigmaXDenom = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
964     const float sigmaYDenom = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;
965 
966     float sum = 0.0f;
967     for (int x = 0; x < width; x++) {
968         float xTerm = static_cast<float>(x - radius.width());
969         xTerm = xTerm * xTerm * sigmaXDenom;
970         for (int y = 0; y < height; y++) {
971             float yTerm = static_cast<float>(y - radius.height());
972             float xyTerm = std::exp(-(xTerm + yTerm * yTerm * sigmaYDenom));
973             // Note that the constant term (1/(sqrt(2*pi*sigma^2)) of the Gaussian
974             // is dropped here, since we renormalize the kernel below.
975             kernel[y * width + x] = xyTerm;
976             sum += xyTerm;
977         }
978     }
979     // Normalize the kernel
980     float scale = 1.0f / sum;
981     for (size_t i = 0; i < kernelSize; ++i) {
982         kernel[i] *= scale;
983     }
984     // Zero remainder of the array
985     memset(kernel.data() + kernelSize, 0, sizeof(float)*(kernel.size() - kernelSize));
986 }
987 
Compute2DBlurKernel(SkSize sigma,SkISize radii,std::array<SkV4,kMaxSamples/4> & kernel)988 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
989                                                 SkISize radii,
990                                                 std::array<SkV4, kMaxSamples/4>& kernel) {
991     static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxSamples>));
992     static_assert(alignof(float) == alignof(SkV4));
993     float* data = kernel[0].ptr();
994     Compute2DBlurKernel(sigma, radii, SkSpan<float>(data, kMaxSamples));
995 }
996 
Compute2DBlurOffsets(SkISize radius,std::array<SkV4,kMaxSamples/2> & offsets)997 void SkShaderBlurAlgorithm::Compute2DBlurOffsets(SkISize radius,
998                                                  std::array<SkV4, kMaxSamples/2>& offsets) {
999     const int kernelArea = KernelWidth(radius.width()) * KernelWidth(radius.height());
1000     SkASSERT(kernelArea <= kMaxSamples);
1001 
1002     SkSpan<float> offsetView{offsets[0].ptr(), kMaxSamples*2};
1003 
1004     int i = 0;
1005     for (int y = -radius.height(); y <= radius.height(); ++y) {
1006         for (int x = -radius.width(); x <= radius.width(); ++x) {
1007             offsetView[2*i]   = x;
1008             offsetView[2*i+1] = y;
1009             ++i;
1010         }
1011     }
1012     SkASSERT(i == kernelArea);
1013     const int lastValidOffset = 2*(kernelArea - 1);
1014     for (; i < kMaxSamples; ++i) {
1015         offsetView[2*i]   = offsetView[lastValidOffset];
1016         offsetView[2*i+1] = offsetView[lastValidOffset+1];
1017     }
1018 }
1019 
Compute1DBlurLinearKernel(float sigma,int radius,std::array<SkV4,kMaxSamples/2> & offsetsAndKernel)1020 void SkShaderBlurAlgorithm::Compute1DBlurLinearKernel(
1021         float sigma,
1022         int radius,
1023         std::array<SkV4, kMaxSamples/2>& offsetsAndKernel) {
1024     SkASSERT(sigma <= kMaxLinearSigma);
1025     SkASSERT(radius == SkBlurEngine::SigmaToRadius(sigma));
1026     SkASSERT(LinearKernelWidth(radius) <= kMaxSamples);
1027 
1028     // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
1029     // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
1030     // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
1031     // Solving W' * x = Wj, W' * (1 - x) = Wi:
1032     // W' = Wi + Wj
1033     // x = Wj / (Wi + Wj)
1034     auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
1035         *new_w = wi + wj;
1036         *offset = wj / (wi + wj);
1037     };
1038 
1039     // Create a temporary standard kernel. The maximum blur radius that can be passed to this
1040     // function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel width.
1041     static constexpr int kMaxKernelWidth = KernelWidth(kMaxSamples - 1);
1042     SkASSERT(KernelWidth(radius) <= kMaxKernelWidth);
1043     std::array<float, kMaxKernelWidth> fullKernel;
1044     Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), KernelWidth(radius)});
1045 
1046     std::array<float, kMaxSamples> kernel;
1047     std::array<float, kMaxSamples> offsets;
1048     // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output array.
1049     int halfSize = LinearKernelWidth(radius);
1050     int halfRadius = halfSize / 2;
1051     int lowIndex = halfRadius - 1;
1052 
1053     // Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
1054     // compute only the upper half and mirror to the lower half.
1055 
1056     int index = radius;
1057     if (radius & 1) {
1058         // If N is odd, then use two samples.
1059         // The centre texel gets sampled twice, so halve its influence for each sample.
1060         // We essentially sample like this:
1061         // Texel edges
1062         // v    v    v    v
1063         // |    |    |    |
1064         // \-----^---/ Lower sample
1065         //      \---^-----/ Upper sample
1066         get_new_weight(&kernel[halfRadius],
1067                        &offsets[halfRadius],
1068                        fullKernel[index] * 0.5f,
1069                        fullKernel[index + 1]);
1070         kernel[lowIndex] = kernel[halfRadius];
1071         offsets[lowIndex] = -offsets[halfRadius];
1072         index++;
1073         lowIndex--;
1074     } else {
1075         // If N is even, then there are an even number of texels on either side of the centre texel.
1076         // Sample the centre texel directly.
1077         kernel[halfRadius] = fullKernel[index];
1078         offsets[halfRadius] = 0.0f;
1079     }
1080     index++;
1081 
1082     // Every other pair gets one sample.
1083     for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
1084         get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
1085         offsets[i] += static_cast<float>(index - radius);
1086 
1087         // Mirror to lower half.
1088         kernel[lowIndex] = kernel[i];
1089         offsets[lowIndex] = -offsets[i];
1090     }
1091 
1092     // Zero out remaining values in the kernel
1093     memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxSamples - halfSize));
1094     // But copy the last valid offset into the remaining offsets, to increase the chance that
1095     // over-iteration in a fragment shader will have a cache hit.
1096     for (int i = halfSize; i < kMaxSamples; ++i) {
1097         offsets[i] = offsets[halfSize - 1];
1098     }
1099 
1100     // Interleave into the output array to match the 1D SkSL effect
1101     for (int i = 0; i < kMaxSamples / 2; ++i) {
1102         offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
1103     }
1104 }
1105 
to_stablekey(int kernelWidth,uint32_t baseKey)1106 static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
1107     SkASSERT(kernelWidth >= 2 && kernelWidth <= SkShaderBlurAlgorithm::kMaxSamples);
1108     switch(kernelWidth) {
1109         // Batch on multiples of 4 (skipping width=1, since that can't happen)
1110         case 2:  [[fallthrough]];
1111         case 3:  [[fallthrough]];
1112         case 4:  return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey);
1113         case 5:  [[fallthrough]];
1114         case 6:  [[fallthrough]];
1115         case 7:  [[fallthrough]];
1116         case 8:  return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+1);
1117         case 9:  [[fallthrough]];
1118         case 10: [[fallthrough]];
1119         case 11: [[fallthrough]];
1120         case 12: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+2);
1121         case 13: [[fallthrough]];
1122         case 14: [[fallthrough]];
1123         case 15: [[fallthrough]];
1124         case 16: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+3);
1125         case 17: [[fallthrough]];
1126         case 18: [[fallthrough]];
1127         case 19: [[fallthrough]];
1128         // With larger kernels, batch on multiples of eight so up to 7 wasted samples.
1129         case 20: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+4);
1130         case 21: [[fallthrough]];
1131         case 22: [[fallthrough]];
1132         case 23: [[fallthrough]];
1133         case 24: [[fallthrough]];
1134         case 25: [[fallthrough]];
1135         case 26: [[fallthrough]];
1136         case 27: [[fallthrough]];
1137         case 28: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+5);
1138         default:
1139             SkUNREACHABLE;
1140     }
1141 }
1142 
GetLinearBlur1DEffect(int radius)1143 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetLinearBlur1DEffect(int radius) {
1144     return GetKnownRuntimeEffect(
1145             to_stablekey(LinearKernelWidth(radius),
1146                          static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase)));
1147 }
1148 
GetBlur2DEffect(const SkISize & radii)1149 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetBlur2DEffect(const SkISize& radii) {
1150     int kernelArea = KernelWidth(radii.width()) * KernelWidth(radii.height());
1151     return GetKnownRuntimeEffect(
1152             to_stablekey(kernelArea,
1153                          static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase)));
1154 }
1155 
renderBlur(SkRuntimeShaderBuilder * blurEffectBuilder,SkFilterMode filter,SkISize radii,sk_sp<SkSpecialImage> input,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1156 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::renderBlur(SkRuntimeShaderBuilder* blurEffectBuilder,
1157                                                         SkFilterMode filter,
1158                                                         SkISize radii,
1159                                                         sk_sp<SkSpecialImage> input,
1160                                                         const SkIRect& srcRect,
1161                                                         SkTileMode tileMode,
1162                                                         const SkIRect& dstRect) const {
1163     SkImageInfo outII = SkImageInfo::Make({dstRect.width(), dstRect.height()},
1164                                           input->colorType(),
1165                                           kPremul_SkAlphaType,
1166                                           input->colorInfo().refColorSpace());
1167     sk_sp<SkDevice> device = this->makeDevice(outII);
1168     if (!device) {
1169         return nullptr;
1170     }
1171 
1172     SkIRect subset = SkIRect::MakeSize(dstRect.size());
1173     device->clipRect(SkRect::Make(subset), SkClipOp::kIntersect, /*aa=*/false);
1174     device->setLocalToDevice(SkM44::Translate(-dstRect.left(), -dstRect.top()));
1175 
1176     // renderBlur() will either mix multiple fast and strict draws to cover dstRect, or will issue
1177     // a single strict draw. While the SkShader object changes (really just strict mode), the rest
1178     // of the SkPaint remains the same.
1179     SkPaint paint;
1180     paint.setBlendMode(SkBlendMode::kSrc);
1181 
1182     SkIRect safeSrcRect = srcRect.makeInset(radii.width(), radii.height());
1183     SkIRect fastDstRect = dstRect;
1184 
1185     // Only consider the safeSrcRect for shader-based tiling if the original srcRect is different
1186     // from the backing store dimensions; when they match the full image we can use HW tiling.
1187     if (srcRect != SkIRect::MakeSize(input->backingStoreDimensions())) {
1188         if (fastDstRect.intersect(safeSrcRect)) {
1189             // If the area of the non-clamping shader is small, it's better to just issue a single
1190             // draw that performs shader tiling over the whole dst.
1191             if (fastDstRect != dstRect && fastDstRect.width() * fastDstRect.height() < 128 * 128) {
1192                 fastDstRect.setEmpty();
1193             }
1194         } else {
1195             fastDstRect.setEmpty();
1196         }
1197     }
1198 
1199     if (!fastDstRect.isEmpty()) {
1200         // Fill as much as possible without adding shader tiling logic to each blur sample,
1201         // switching to clamp tiling if we aren't in this block due to HW tiling.
1202         SkIRect untiledSrcRect = srcRect.makeInset(1, 1);
1203         SkTileMode fastTileMode = untiledSrcRect.contains(fastDstRect) ? SkTileMode::kClamp
1204                                                                        : tileMode;
1205         blurEffectBuilder->child("child") = input->asShader(
1206                 fastTileMode, filter, SkMatrix::I(), /*strict=*/false);
1207         paint.setShader(blurEffectBuilder->makeShader());
1208         device->drawRect(SkRect::Make(fastDstRect), paint);
1209     }
1210 
1211     // Switch to a strict shader if there are remaining pixels to fill
1212     if (fastDstRect != dstRect) {
1213         blurEffectBuilder->child("child") = input->makeSubset(srcRect)->asShader(
1214                 tileMode, filter, SkMatrix::Translate(srcRect.left(), srcRect.top()));
1215         paint.setShader(blurEffectBuilder->makeShader());
1216     }
1217 
1218     if (fastDstRect.isEmpty()) {
1219         // Fill the entire dst with the strict shader
1220         device->drawRect(SkRect::Make(dstRect), paint);
1221     } else if (fastDstRect != dstRect) {
1222         // There will be up to four additional strict draws to fill in the border. The left and
1223         // right sides will span the full height of the dst rect. The top and bottom will span
1224         // the just the width of the fast interior. Strict border draws with zero width/height
1225         // are skipped.
1226         auto drawBorder = [&](const SkIRect& r) {
1227             if (!r.isEmpty()) {
1228                 device->drawRect(SkRect::Make(r), paint);
1229             }
1230         };
1231 
1232         drawBorder({dstRect.left(),      dstRect.top(),
1233                     fastDstRect.left(),  dstRect.bottom()});   // Left, spanning full height
1234         drawBorder({fastDstRect.right(), dstRect.top(),
1235                     dstRect.right(),     dstRect.bottom()});   // Right, spanning full height
1236         drawBorder({fastDstRect.left(),  dstRect.top(),
1237                     fastDstRect.right(), fastDstRect.top()});  // Top, spanning inner width
1238         drawBorder({fastDstRect.left(),  fastDstRect.bottom(),
1239                     fastDstRect.right(), dstRect.bottom()});   // Bottom, spanning inner width
1240     }
1241 
1242     return device->snapSpecial(subset);
1243 }
1244 
evalBlur2D(SkSize sigma,SkISize radii,sk_sp<SkSpecialImage> input,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1245 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur2D(SkSize sigma,
1246                                                         SkISize radii,
1247                                                         sk_sp<SkSpecialImage> input,
1248                                                         const SkIRect& srcRect,
1249                                                         SkTileMode tileMode,
1250                                                         const SkIRect& dstRect) const {
1251     std::array<SkV4, kMaxSamples/4> kernel;
1252     std::array<SkV4, kMaxSamples/2> offsets;
1253     Compute2DBlurKernel(sigma, radii, kernel);
1254     Compute2DBlurOffsets(radii, offsets);
1255 
1256     SkRuntimeShaderBuilder builder{sk_ref_sp(GetBlur2DEffect(radii))};
1257     builder.uniform("kernel") = kernel;
1258     builder.uniform("offsets") = offsets;
1259     // NOTE: renderBlur() will configure the "child" shader as needed. The 2D blur effect only
1260     // requires nearest-neighbor filtering.
1261     return this->renderBlur(&builder, SkFilterMode::kNearest, radii,
1262                             std::move(input), srcRect, tileMode, dstRect);
1263 }
1264 
evalBlur1D(float sigma,int radius,SkV2 dir,sk_sp<SkSpecialImage> input,SkIRect srcRect,SkTileMode tileMode,SkIRect dstRect) const1265 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur1D(float sigma,
1266                                                         int radius,
1267                                                         SkV2 dir,
1268                                                         sk_sp<SkSpecialImage> input,
1269                                                         SkIRect srcRect,
1270                                                         SkTileMode tileMode,
1271                                                         SkIRect dstRect) const {
1272     std::array<SkV4, kMaxSamples/2> offsetsAndKernel;
1273     Compute1DBlurLinearKernel(sigma, radius, offsetsAndKernel);
1274 
1275     SkRuntimeShaderBuilder builder{sk_ref_sp(GetLinearBlur1DEffect(radius))};
1276     builder.uniform("offsetsAndKernel") = offsetsAndKernel;
1277     builder.uniform("dir") = dir;
1278     // NOTE: renderBlur() will configure the "child" shader as needed. The 1D blur effect requires
1279     // linear filtering. Reconstruct the appropriate "2D" radii inset value from 'dir'.
1280     SkISize radii{dir.x ? radius : 0, dir.y ? radius : 0};
1281     return this->renderBlur(&builder, SkFilterMode::kLinear, radii,
1282                             std::move(input), srcRect, tileMode, dstRect);
1283 }
1284 
blur(SkSize sigma,sk_sp<SkSpecialImage> src,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1285 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::blur(SkSize sigma,
1286                                                   sk_sp<SkSpecialImage> src,
1287                                                   const SkIRect& srcRect,
1288                                                   SkTileMode tileMode,
1289                                                   const SkIRect& dstRect) const {
1290     SkASSERT(sigma.width() <= kMaxLinearSigma &&  sigma.height() <= kMaxLinearSigma);
1291 
1292     int radiusX = SkBlurEngine::SigmaToRadius(sigma.width());
1293     int radiusY = SkBlurEngine::SigmaToRadius(sigma.height());
1294     const int kernelArea = KernelWidth(radiusX) * KernelWidth(radiusY);
1295     if (kernelArea <= kMaxSamples && radiusX > 0 && radiusY > 0) {
1296         // Use a single-pass 2D kernel if it fits and isn't just 1D already
1297         return this->evalBlur2D(sigma,
1298                                 {radiusX, radiusY},
1299                                 std::move(src),
1300                                 srcRect,
1301                                 tileMode,
1302                                 dstRect);
1303     } else {
1304         // Use two passes of a 1D kernel (one per axis).
1305         SkIRect intermediateSrcRect = srcRect;
1306         SkIRect intermediateDstRect = dstRect;
1307         if (radiusX > 0) {
1308             if (radiusY > 0) {
1309                 // May need to maintain extra rows above and below 'dstRect' for the follow-up pass.
1310                 if (tileMode == SkTileMode::kRepeat || tileMode == SkTileMode::kMirror) {
1311                     // If the srcRect and dstRect are aligned, then we don't need extra rows since
1312                     // the periodic tiling on srcRect is the same for the intermediate. If they
1313                     // are not aligned, then outset by the Y radius.
1314                     const int period = srcRect.height() * (tileMode == SkTileMode::kMirror ? 2 : 1);
1315                     if (std::abs(dstRect.fTop - srcRect.fTop) % period != 0 ||
1316                         dstRect.height() != srcRect.height()) {
1317                         intermediateDstRect.outset(0, radiusY);
1318                     }
1319                 } else {
1320                     // For clamp and decal tiling, we outset by the Y radius up to what's available
1321                     // from the srcRect. Anything beyond that is identical to tiling the
1322                     // intermediate dst image directly.
1323                     intermediateDstRect.outset(0, radiusY);
1324                     intermediateDstRect.fTop = std::max(intermediateDstRect.fTop, srcRect.fTop);
1325                     intermediateDstRect.fBottom =
1326                             std::min(intermediateDstRect.fBottom, srcRect.fBottom);
1327                     if (intermediateDstRect.fTop >= intermediateDstRect.fBottom) {
1328                         return nullptr;
1329                     }
1330                 }
1331             }
1332 
1333             src = this->evalBlur1D(sigma.width(),
1334                                    radiusX,
1335                                    /*dir=*/{1.f, 0.f},
1336                                    std::move(src),
1337                                    srcRect,
1338                                    tileMode,
1339                                    intermediateDstRect);
1340             if (!src) {
1341                 return nullptr;
1342             }
1343             intermediateSrcRect = SkIRect::MakeWH(src->width(), src->height());
1344             intermediateDstRect = dstRect.makeOffset(-intermediateDstRect.left(),
1345                                                      -intermediateDstRect.top());
1346         }
1347 
1348         if (radiusY > 0) {
1349             src = this->evalBlur1D(sigma.height(),
1350                                    radiusY,
1351                                    /*dir=*/{0.f, 1.f},
1352                                    std::move(src),
1353                                    intermediateSrcRect,
1354                                    tileMode,
1355                                    intermediateDstRect);
1356         }
1357 
1358         return src;
1359     }
1360 }
1361