1 /*
2 * Copyright 2011 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/effects/SkImageFilters.h"
9
10 #include "include/core/SkBitmap.h"
11 #include "include/core/SkColor.h"
12 #include "include/core/SkColorType.h"
13 #include "include/core/SkFlattenable.h"
14 #include "include/core/SkImageFilter.h"
15 #include "include/core/SkImageInfo.h"
16 #include "include/core/SkRect.h"
17 #include "include/core/SkRefCnt.h"
18 #include "include/core/SkScalar.h"
19 #include "include/core/SkSize.h"
20 #include "include/core/SkTileMode.h"
21 #include "include/core/SkTypes.h"
22 #include "include/private/base/SkFloatingPoint.h"
23 #include "include/private/base/SkMalloc.h"
24 #include "include/private/base/SkTo.h"
25 #include "src/base/SkArenaAlloc.h"
26 #include "src/base/SkVx.h"
27 #include "src/core/SkImageFilterTypes.h"
28 #include "src/core/SkImageFilter_Base.h"
29 #include "src/core/SkReadBuffer.h"
30 #include "src/core/SkSpecialImage.h"
31 #include "src/core/SkWriteBuffer.h"
32
33 #include <algorithm>
34 #include <cmath>
35 #include <cstdint>
36 #include <cstring>
37 #include <optional>
38 #include <utility>
39
40 struct SkIPoint;
41
42 #if defined(SK_GANESH) || defined(SK_GRAPHITE)
43 #include "src/gpu/BlurUtils.h"
44 #endif
45
46 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
47 #include <xmmintrin.h>
48 #define SK_PREFETCH(ptr) _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0)
49 #elif defined(__GNUC__)
50 #define SK_PREFETCH(ptr) __builtin_prefetch(ptr)
51 #else
52 #define SK_PREFETCH(ptr)
53 #endif
54
55 namespace {
56
57 class SkBlurImageFilter final : public SkImageFilter_Base {
58 public:
SkBlurImageFilter(SkSize sigma,sk_sp<SkImageFilter> input)59 SkBlurImageFilter(SkSize sigma, sk_sp<SkImageFilter> input)
60 : SkImageFilter_Base(&input, 1)
61 , fSigma{sigma} {}
62
SkBlurImageFilter(SkSize sigma,SkTileMode legacyTileMode,sk_sp<SkImageFilter> input)63 SkBlurImageFilter(SkSize sigma, SkTileMode legacyTileMode, sk_sp<SkImageFilter> input)
64 : SkImageFilter_Base(&input, 1)
65 , fSigma(sigma)
66 , fLegacyTileMode(legacyTileMode) {}
67
68 SkRect computeFastBounds(const SkRect&) const override;
69
70 protected:
71 void flatten(SkWriteBuffer&) const override;
72
73 private:
74 friend void ::SkRegisterBlurImageFilterFlattenable();
75 SK_FLATTENABLE_HOOKS(SkBlurImageFilter)
76
77 skif::FilterResult onFilterImage(const skif::Context& context) const override;
78
79 skif::LayerSpace<SkIRect> onGetInputLayerBounds(
80 const skif::Mapping& mapping,
81 const skif::LayerSpace<SkIRect>& desiredOutput,
82 std::optional<skif::LayerSpace<SkIRect>> contentBounds) const override;
83
84 std::optional<skif::LayerSpace<SkIRect>> onGetOutputLayerBounds(
85 const skif::Mapping& mapping,
86 std::optional<skif::LayerSpace<SkIRect>> contentBounds) const override;
87
88 skif::LayerSpace<SkSize> mapSigma(const skif::Mapping& mapping, bool gpuBacked) const;
89
kernelBounds(const skif::Mapping & mapping,skif::LayerSpace<SkIRect> bounds,bool gpuBacked) const90 skif::LayerSpace<SkIRect> kernelBounds(const skif::Mapping& mapping,
91 skif::LayerSpace<SkIRect> bounds,
92 bool gpuBacked) const {
93 skif::LayerSpace<SkSize> sigma = this->mapSigma(mapping, gpuBacked);
94 bounds.outset(skif::LayerSpace<SkSize>({3 * sigma.width(), 3 * sigma.height()}).ceil());
95 return bounds;
96 }
97
98 skif::ParameterSpace<SkSize> fSigma;
99 // kDecal means no legacy tiling, it will be handled by SkCropImageFilter instead. Legacy
100 // tiling occurs when there's no provided crop rect, and should be deleted once clients create
101 // their filters with defined tiling geometry.
102 SkTileMode fLegacyTileMode = SkTileMode::kDecal;
103 };
104
105 } // end namespace
106
Blur(SkScalar sigmaX,SkScalar sigmaY,SkTileMode tileMode,sk_sp<SkImageFilter> input,const CropRect & cropRect)107 sk_sp<SkImageFilter> SkImageFilters::Blur(
108 SkScalar sigmaX, SkScalar sigmaY, SkTileMode tileMode, sk_sp<SkImageFilter> input,
109 const CropRect& cropRect) {
110 if (!SkIsFinite(sigmaX, sigmaY) || sigmaX < 0.f || sigmaY < 0.f) {
111 // Non-finite or negative sigmas are error conditions. We allow 0 sigma for X and/or Y
112 // for 1D blurs; onFilterImage() will detect when no visible blurring would occur based on
113 // the Context mapping.
114 return nullptr;
115 }
116
117 // Temporarily allow tiling with no crop rect
118 if (tileMode != SkTileMode::kDecal && !cropRect) {
119 return sk_make_sp<SkBlurImageFilter>(SkSize{sigmaX, sigmaY}, tileMode, std::move(input));
120 }
121
122 // The 'tileMode' behavior is not well-defined if there is no crop. We only apply it if
123 // there is a provided 'cropRect'.
124 sk_sp<SkImageFilter> filter = std::move(input);
125 if (tileMode != SkTileMode::kDecal && cropRect) {
126 // Historically the input image was restricted to the cropRect when tiling was not
127 // kDecal, so that the kernel evaluated the tiled edge conditions, while a kDecal crop
128 // only affected the output.
129 filter = SkImageFilters::Crop(*cropRect, tileMode, std::move(filter));
130 }
131
132 filter = sk_make_sp<SkBlurImageFilter>(SkSize{sigmaX, sigmaY}, std::move(filter));
133 if (cropRect) {
134 // But regardless of the tileMode, the output is always decal cropped
135 filter = SkImageFilters::Crop(*cropRect, SkTileMode::kDecal, std::move(filter));
136 }
137 return filter;
138 }
139
SkRegisterBlurImageFilterFlattenable()140 void SkRegisterBlurImageFilterFlattenable() {
141 SK_REGISTER_FLATTENABLE(SkBlurImageFilter);
142 SkFlattenable::Register("SkBlurImageFilterImpl", SkBlurImageFilter::CreateProc);
143 }
144
CreateProc(SkReadBuffer & buffer)145 sk_sp<SkFlattenable> SkBlurImageFilter::CreateProc(SkReadBuffer& buffer) {
146 SK_IMAGEFILTER_UNFLATTEN_COMMON(common, 1);
147 SkScalar sigmaX = buffer.readScalar();
148 SkScalar sigmaY = buffer.readScalar();
149 SkTileMode tileMode = buffer.read32LE(SkTileMode::kLastTileMode);
150
151 // NOTE: For new SKPs, 'tileMode' holds the "legacy" tile mode; any originally specified tile
152 // mode with valid tiling geometry is handled in the SkCropImageFilters that wrap the blur.
153 // In a new SKP, when 'tileMode' is not kDecal, common.cropRect() will be null and the blur
154 // will automatically emulate the legacy tiling.
155 //
156 // In old SKPs, the 'tileMode' and common.cropRect() may not be null. ::Blur() automatically
157 // detects when this is a legacy or valid tiling and constructs the DAG appropriately.
158 return SkImageFilters::Blur(
159 sigmaX, sigmaY, tileMode, common.getInput(0), common.cropRect());
160 }
161
flatten(SkWriteBuffer & buffer) const162 void SkBlurImageFilter::flatten(SkWriteBuffer& buffer) const {
163 this->SkImageFilter_Base::flatten(buffer);
164
165 buffer.writeScalar(SkSize(fSigma).fWidth);
166 buffer.writeScalar(SkSize(fSigma).fHeight);
167 buffer.writeInt(static_cast<int>(fLegacyTileMode));
168 }
169
170 ///////////////////////////////////////////////////////////////////////////////
171
172 namespace {
173
174 // TODO: Move these functions into a CPU, 8888-only blur engine implementation; ideally share logic
175 // with the similar techniques in SkMaskBlurFilter on 4x A8 data.
176
177 // TODO(b/294575803): Provide a more accurate CPU implementation at s<2, at which point the notion
178 // of an identity sigma can be consolidated between the different functions.
179 // This is defined by the SVG spec:
180 // https://drafts.fxtf.org/filter-effects/#feGaussianBlurElement
calculate_window(double sigma)181 int calculate_window(double sigma) {
182 auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * SK_DoublePI) / 4 + 0.5));
183 return std::max(1, possibleWindow);
184 }
185
186 // This rather arbitrary-looking value results in a maximum box blur kernel size
187 // of 1000 pixels on the raster path, which matches the WebKit and Firefox
188 // implementations. Since the GPU path does not compute a box blur, putting
189 // the limit on sigma ensures consistent behaviour between the GPU and
190 // raster paths.
191 static constexpr SkScalar kMaxSigma = 532.f;
192
193 class Pass {
194 public:
Pass(int border)195 explicit Pass(int border) : fBorder(border) {}
196 virtual ~Pass() = default;
197
blur(int srcLeft,int srcRight,int dstRight,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)198 void blur(int srcLeft, int srcRight, int dstRight,
199 const uint32_t* src, int srcStride,
200 uint32_t* dst, int dstStride) {
201 this->startBlur();
202
203 auto srcStart = srcLeft - fBorder,
204 srcEnd = srcRight - fBorder,
205 dstEnd = dstRight,
206 srcIdx = srcStart,
207 dstIdx = 0;
208
209 const uint32_t* srcCursor = src;
210 uint32_t* dstCursor = dst;
211
212 if (dstIdx < srcIdx) {
213 // The destination pixels are not effected by the src pixels,
214 // change to zero as per the spec.
215 // https://drafts.fxtf.org/filter-effects/#FilterPrimitivesOverviewIntro
216 int commonEnd = std::min(srcIdx, dstEnd);
217 while (dstIdx < commonEnd) {
218 *dstCursor = 0;
219 dstCursor += dstStride;
220 SK_PREFETCH(dstCursor);
221 dstIdx++;
222 }
223 } else if (srcIdx < dstIdx) {
224 // The edge of the source is before the edge of the destination. Calculate the sums for
225 // the pixels before the start of the destination.
226 if (int commonEnd = std::min(dstIdx, srcEnd); srcIdx < commonEnd) {
227 // Preload the blur with values from src before dst is entered.
228 int n = commonEnd - srcIdx;
229 this->blurSegment(n, srcCursor, srcStride, nullptr, 0);
230 srcIdx += n;
231 srcCursor += n * srcStride;
232 }
233 if (srcIdx < dstIdx) {
234 // The weird case where src is out of pixels before dst is even started.
235 int n = dstIdx - srcIdx;
236 this->blurSegment(n, nullptr, 0, nullptr, 0);
237 srcIdx += n;
238 }
239 }
240
241 if (int commonEnd = std::min(dstEnd, srcEnd); dstIdx < commonEnd) {
242 // Both srcIdx and dstIdx are in sync now, and can run in a 1:1 fashion. This is the
243 // normal mode of operation.
244 SkASSERT(srcIdx == dstIdx);
245
246 int n = commonEnd - dstIdx;
247 this->blurSegment(n, srcCursor, srcStride, dstCursor, dstStride);
248 srcCursor += n * srcStride;
249 dstCursor += n * dstStride;
250 dstIdx += n;
251 srcIdx += n;
252 }
253
254 // Drain the remaining blur values into dst assuming 0's for the leading edge.
255 if (dstIdx < dstEnd) {
256 int n = dstEnd - dstIdx;
257 this->blurSegment(n, nullptr, 0, dstCursor, dstStride);
258 }
259 }
260
261 protected:
262 virtual void startBlur() = 0;
263 virtual void blurSegment(
264 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) = 0;
265
266 private:
267 const int fBorder;
268 };
269
270 class PassMaker {
271 public:
PassMaker(int window)272 explicit PassMaker(int window) : fWindow{window} {}
273 virtual ~PassMaker() = default;
274 virtual Pass* makePass(void* buffer, SkArenaAlloc* alloc) const = 0;
275 virtual size_t bufferSizeBytes() const = 0;
window() const276 int window() const {return fWindow;}
277
278 private:
279 const int fWindow;
280 };
281
282 // Implement a scanline processor that uses a three-box filter to approximate a Gaussian blur.
283 // The GaussPass is limit to processing sigmas < 135.
284 class GaussPass final : public Pass {
285 public:
286 // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
287 // using the Gauss filter. It also limits the size of buffers used hold intermediate values.
288 // Explanation of maximums:
289 // sum0 = window * 255
290 // sum1 = window * sum0 -> window * window * 255
291 // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255
292 //
293 // The value window^3 * 255 must fit in a uint32_t. So,
294 // window^3 < 2^32. window = 255.
295 //
296 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
297 // For window <= 255, the largest value for sigma is 136.
MakeMaker(double sigma,SkArenaAlloc * alloc)298 static PassMaker* MakeMaker(double sigma, SkArenaAlloc* alloc) {
299 SkASSERT(0 <= sigma);
300 int window = calculate_window(sigma);
301 if (255 <= window) {
302 return nullptr;
303 }
304
305 class Maker : public PassMaker {
306 public:
307 explicit Maker(int window) : PassMaker{window} {}
308 Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
309 return GaussPass::Make(this->window(), buffer, alloc);
310 }
311
312 size_t bufferSizeBytes() const override {
313 int window = this->window();
314 size_t onePassSize = window - 1;
315 // If the window is odd, then there is an obvious middle element. For even sizes
316 // 2 passes are shifted, and the last pass has an extra element. Like this:
317 // S
318 // aaaAaa
319 // bbBbbb
320 // cccCccc
321 // D
322 size_t bufferCount = (window & 1) == 1 ? 3 * onePassSize : 3 * onePassSize + 1;
323 return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
324 }
325 };
326
327 return alloc->make<Maker>(window);
328 }
329
Make(int window,void * buffers,SkArenaAlloc * alloc)330 static GaussPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
331 // We don't need to store the trailing edge pixel in the buffer;
332 int passSize = window - 1;
333 skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
334 skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
335 skvx::Vec<4, uint32_t>* buffer2 = buffer1 + passSize;
336 // If the window is odd just one buffer is needed, but if it's even, then there is one
337 // more element on that pass.
338 skvx::Vec<4, uint32_t>* buffersEnd = buffer2 + ((window & 1) ? passSize : passSize + 1);
339
340 // Calculating the border is tricky. The border is the distance in pixels between the first
341 // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
342 // I will go through the odd case which is simpler, and then through the even case. Given a
343 // stack of filters seven wide for the odd case of three passes.
344 //
345 // S
346 // aaaAaaa
347 // bbbBbbb
348 // cccCccc
349 // D
350 //
351 // The furthest changed pixel is when the filters are in the following configuration.
352 //
353 // S
354 // aaaAaaa
355 // bbbBbbb
356 // cccCccc
357 // D
358 //
359 // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
360 // finally D is C. So, with a window size of seven the border is nine. In the odd case, the
361 // border is 3*((window - 1)/2).
362 //
363 // For even cases the filter stack is more complicated. The spec specifies two passes
364 // of even filters and a final pass of odd filters. A stack for a width of six looks like
365 // this.
366 //
367 // S
368 // aaaAaa
369 // bbBbbb
370 // cccCccc
371 // D
372 //
373 // The furthest pixel looks like this.
374 //
375 // S
376 // aaaAaa
377 // bbBbbb
378 // cccCccc
379 // D
380 //
381 // For a window of six, the border value is eight. In the even case the border is 3 *
382 // (window/2) - 1.
383 int border = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
384
385 // If the window is odd then the divisor is just window ^ 3 otherwise,
386 // it is window * window * (window + 1) = window ^ 3 + window ^ 2;
387 int window2 = window * window;
388 int window3 = window2 * window;
389 int divisor = (window & 1) == 1 ? window3 : window3 + window2;
390 return alloc->make<GaussPass>(buffer0, buffer1, buffer2, buffersEnd, border, divisor);
391 }
392
GaussPass(skvx::Vec<4,uint32_t> * buffer0,skvx::Vec<4,uint32_t> * buffer1,skvx::Vec<4,uint32_t> * buffer2,skvx::Vec<4,uint32_t> * buffersEnd,int border,int divisor)393 GaussPass(skvx::Vec<4, uint32_t>* buffer0,
394 skvx::Vec<4, uint32_t>* buffer1,
395 skvx::Vec<4, uint32_t>* buffer2,
396 skvx::Vec<4, uint32_t>* buffersEnd,
397 int border,
398 int divisor)
399 : Pass{border}
400 , fBuffer0{buffer0}
401 , fBuffer1{buffer1}
402 , fBuffer2{buffer2}
403 , fBuffersEnd{buffersEnd}
404 , fDivider(divisor) {}
405
406 private:
startBlur()407 void startBlur() override {
408 skvx::Vec<4, uint32_t> zero = {0u, 0u, 0u, 0u};
409 zero.store(fSum0);
410 zero.store(fSum1);
411 auto half = fDivider.half();
412 skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum2);
413 sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
414
415 fBuffer0Cursor = fBuffer0;
416 fBuffer1Cursor = fBuffer1;
417 fBuffer2Cursor = fBuffer2;
418 }
419
420 // GaussPass implements the common three pass box filter approximation of Gaussian blur,
421 // but combines all three passes into a single pass. This approach is facilitated by three
422 // circular buffers the width of the window which track values for trailing edges of each of
423 // the three passes. This allows the algorithm to use more precision in the calculation
424 // because the values are not rounded each pass. And this implementation also avoids a trap
425 // that's easy to fall into resulting in blending in too many zeroes near the edge.
426 //
427 // In general, a window sum has the form:
428 // sum_n+1 = sum_n + leading_edge - trailing_edge.
429 // If instead we do the subtraction at the end of the previous iteration, we can just
430 // calculate the sums instead of having to do the subtractions too.
431 //
432 // In previous iteration:
433 // sum_n+1 = sum_n - trailing_edge.
434 //
435 // In this iteration:
436 // sum_n+1 = sum_n + leading_edge.
437 //
438 // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
439 // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
440 // three passes at the same time has the form:
441 //
442 // sum0_n+1 = sum0_n + leading edge
443 // sum1_n+1 = sum1_n + sum0_n+1
444 // sum2_n+1 = sum2_n + sum1_n+1
445 //
446 // sum2_n+1 / window^3 is the new value of the destination pixel.
447 //
448 // Reduce the sums by the trailing edges which were stored in the circular buffers for the
449 // next go around. This is the case for odd sized windows, even windows the the third
450 // circular buffer is one larger then the first two circular buffers.
451 //
452 // sum2_n+2 = sum2_n+1 - buffer2[i];
453 // buffer2[i] = sum1;
454 // sum1_n+2 = sum1_n+1 - buffer1[i];
455 // buffer1[i] = sum0;
456 // sum0_n+2 = sum0_n+1 - buffer0[i];
457 // buffer0[i] = leading edge
blurSegment(int n,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)458 void blurSegment(
459 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
460 skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
461 skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
462 skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
463 skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
464 skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
465 skvx::Vec<4, uint32_t> sum2 = skvx::Vec<4, uint32_t>::Load(fSum2);
466
467 // Given an expanded input pixel, move the window ahead using the leadingEdge value.
468 auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
469 sum0 += leadingEdge;
470 sum1 += sum0;
471 sum2 += sum1;
472
473 skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum2);
474
475 sum2 -= *buffer2Cursor;
476 *buffer2Cursor = sum1;
477 buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
478 sum1 -= *buffer1Cursor;
479 *buffer1Cursor = sum0;
480 buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
481 sum0 -= *buffer0Cursor;
482 *buffer0Cursor = leadingEdge;
483 buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
484
485 return skvx::cast<uint8_t>(blurred);
486 };
487
488 auto loadEdge = [&](const uint32_t* srcCursor) {
489 return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
490 };
491
492 if (!src && !dst) {
493 while (n --> 0) {
494 (void)processValue(0);
495 }
496 } else if (src && !dst) {
497 while (n --> 0) {
498 (void)processValue(loadEdge(src));
499 src += srcStride;
500 }
501 } else if (!src && dst) {
502 while (n --> 0) {
503 processValue(0u).store(dst);
504 dst += dstStride;
505 }
506 } else if (src && dst) {
507 while (n --> 0) {
508 processValue(loadEdge(src)).store(dst);
509 src += srcStride;
510 dst += dstStride;
511 }
512 }
513
514 // Store the state
515 fBuffer0Cursor = buffer0Cursor;
516 fBuffer1Cursor = buffer1Cursor;
517 fBuffer2Cursor = buffer2Cursor;
518
519 sum0.store(fSum0);
520 sum1.store(fSum1);
521 sum2.store(fSum2);
522 }
523
524 skvx::Vec<4, uint32_t>* const fBuffer0;
525 skvx::Vec<4, uint32_t>* const fBuffer1;
526 skvx::Vec<4, uint32_t>* const fBuffer2;
527 skvx::Vec<4, uint32_t>* const fBuffersEnd;
528 const skvx::ScaledDividerU32 fDivider;
529
530 // blur state
531 char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
532 char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
533 char fSum2[sizeof(skvx::Vec<4, uint32_t>)];
534 skvx::Vec<4, uint32_t>* fBuffer0Cursor;
535 skvx::Vec<4, uint32_t>* fBuffer1Cursor;
536 skvx::Vec<4, uint32_t>* fBuffer2Cursor;
537 };
538
539 // Implement a scanline processor that uses a two-box filter to approximate a Tent filter.
540 // The TentPass is limit to processing sigmas < 2183.
541 class TentPass final : public Pass {
542 public:
543 // NB 2183 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
544 // using the Tent filter. It also limits the size of buffers used hold intermediate values.
545 // Explanation of maximums:
546 // sum0 = window * 255
547 // sum1 = window * sum0 -> window * window * 255
548 //
549 // The value window^2 * 255 must fit in a uint32_t. So,
550 // window^2 < 2^32. window = 4104.
551 //
552 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
553 // For window <= 4104, the largest value for sigma is 2183.
MakeMaker(double sigma,SkArenaAlloc * alloc)554 static PassMaker* MakeMaker(double sigma, SkArenaAlloc* alloc) {
555 SkASSERT(0 <= sigma);
556 int gaussianWindow = calculate_window(sigma);
557 // This is a naive method of using the window size for the Gaussian blur to calculate the
558 // window size for the Tent blur. This seems to work well in practice.
559 //
560 // We can use a single pixel to generate the effective blur area given a window size. For
561 // the Gaussian blur this is 3 * window size. For the Tent filter this is 2 * window size.
562 int tentWindow = 3 * gaussianWindow / 2;
563 if (tentWindow >= 4104) {
564 return nullptr;
565 }
566
567 class Maker : public PassMaker {
568 public:
569 explicit Maker(int window) : PassMaker{window} {}
570 Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
571 return TentPass::Make(this->window(), buffer, alloc);
572 }
573
574 size_t bufferSizeBytes() const override {
575 size_t onePassSize = this->window() - 1;
576 // If the window is odd, then there is an obvious middle element. For even sizes 2
577 // passes are shifted, and the last pass has an extra element. Like this:
578 // S
579 // aaaAaa
580 // bbBbbb
581 // D
582 size_t bufferCount = 2 * onePassSize;
583 return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
584 }
585 };
586
587 return alloc->make<Maker>(tentWindow);
588 }
589
Make(int window,void * buffers,SkArenaAlloc * alloc)590 static TentPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
591 if (window > 4104) {
592 return nullptr;
593 }
594
595 // We don't need to store the trailing edge pixel in the buffer;
596 int passSize = window - 1;
597 skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
598 skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
599 skvx::Vec<4, uint32_t>* buffersEnd = buffer1 + passSize;
600
601 // Calculating the border is tricky. The border is the distance in pixels between the first
602 // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
603 // I will go through the odd case which is simpler, and then through the even case. Given a
604 // stack of filters seven wide for the odd case of three passes.
605 //
606 // S
607 // aaaAaaa
608 // bbbBbbb
609 // D
610 //
611 // The furthest changed pixel is when the filters are in the following configuration.
612 //
613 // S
614 // aaaAaaa
615 // bbbBbbb
616 // D
617 //
618 // The A pixel is calculated using the value S, the B uses A, and the D uses B.
619 // So, with a window size of seven the border is nine. In the odd case, the border is
620 // window - 1.
621 //
622 // For even cases the filter stack is more complicated. It uses two passes
623 // of even filters offset from each other. A stack for a width of six looks like
624 // this.
625 //
626 // S
627 // aaaAaa
628 // bbBbbb
629 // D
630 //
631 // The furthest pixel looks like this.
632 //
633 // S
634 // aaaAaa
635 // bbBbbb
636 // D
637 //
638 // For a window of six, the border value is 5. In the even case the border is
639 // window - 1.
640 int border = window - 1;
641
642 int divisor = window * window;
643 return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisor);
644 }
645
TentPass(skvx::Vec<4,uint32_t> * buffer0,skvx::Vec<4,uint32_t> * buffer1,skvx::Vec<4,uint32_t> * buffersEnd,int border,int divisor)646 TentPass(skvx::Vec<4, uint32_t>* buffer0,
647 skvx::Vec<4, uint32_t>* buffer1,
648 skvx::Vec<4, uint32_t>* buffersEnd,
649 int border,
650 int divisor)
651 : Pass{border}
652 , fBuffer0{buffer0}
653 , fBuffer1{buffer1}
654 , fBuffersEnd{buffersEnd}
655 , fDivider(divisor) {}
656
657 private:
startBlur()658 void startBlur() override {
659 skvx::Vec<4, uint32_t>{0u, 0u, 0u, 0u}.store(fSum0);
660 auto half = fDivider.half();
661 skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum1);
662 sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
663
664 fBuffer0Cursor = fBuffer0;
665 fBuffer1Cursor = fBuffer1;
666 }
667
668 // TentPass implements the common two pass box filter approximation of Tent filter,
669 // but combines all both passes into a single pass. This approach is facilitated by two
670 // circular buffers the width of the window which track values for trailing edges of each of
671 // both passes. This allows the algorithm to use more precision in the calculation
672 // because the values are not rounded each pass. And this implementation also avoids a trap
673 // that's easy to fall into resulting in blending in too many zeroes near the edge.
674 //
675 // In general, a window sum has the form:
676 // sum_n+1 = sum_n + leading_edge - trailing_edge.
677 // If instead we do the subtraction at the end of the previous iteration, we can just
678 // calculate the sums instead of having to do the subtractions too.
679 //
680 // In previous iteration:
681 // sum_n+1 = sum_n - trailing_edge.
682 //
683 // In this iteration:
684 // sum_n+1 = sum_n + leading_edge.
685 //
686 // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
687 // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
688 // three passes at the same time has the form:
689 //
690 // sum0_n+1 = sum0_n + leading edge
691 // sum1_n+1 = sum1_n + sum0_n+1
692 //
693 // sum1_n+1 / window^2 is the new value of the destination pixel.
694 //
695 // Reduce the sums by the trailing edges which were stored in the circular buffers for the
696 // next go around.
697 //
698 // sum1_n+2 = sum1_n+1 - buffer1[i];
699 // buffer1[i] = sum0;
700 // sum0_n+2 = sum0_n+1 - buffer0[i];
701 // buffer0[i] = leading edge
blurSegment(int n,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)702 void blurSegment(
703 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
704 skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
705 skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
706 skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
707 skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
708
709 // Given an expanded input pixel, move the window ahead using the leadingEdge value.
710 auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
711 sum0 += leadingEdge;
712 sum1 += sum0;
713
714 skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum1);
715
716 sum1 -= *buffer1Cursor;
717 *buffer1Cursor = sum0;
718 buffer1Cursor = (buffer1Cursor + 1) < fBuffersEnd ? buffer1Cursor + 1 : fBuffer1;
719 sum0 -= *buffer0Cursor;
720 *buffer0Cursor = leadingEdge;
721 buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
722
723 return skvx::cast<uint8_t>(blurred);
724 };
725
726 auto loadEdge = [&](const uint32_t* srcCursor) {
727 return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
728 };
729
730 if (!src && !dst) {
731 while (n --> 0) {
732 (void)processValue(0);
733 }
734 } else if (src && !dst) {
735 while (n --> 0) {
736 (void)processValue(loadEdge(src));
737 src += srcStride;
738 }
739 } else if (!src && dst) {
740 while (n --> 0) {
741 processValue(0u).store(dst);
742 dst += dstStride;
743 }
744 } else if (src && dst) {
745 while (n --> 0) {
746 processValue(loadEdge(src)).store(dst);
747 src += srcStride;
748 dst += dstStride;
749 }
750 }
751
752 // Store the state
753 fBuffer0Cursor = buffer0Cursor;
754 fBuffer1Cursor = buffer1Cursor;
755 sum0.store(fSum0);
756 sum1.store(fSum1);
757 }
758
759 skvx::Vec<4, uint32_t>* const fBuffer0;
760 skvx::Vec<4, uint32_t>* const fBuffer1;
761 skvx::Vec<4, uint32_t>* const fBuffersEnd;
762 const skvx::ScaledDividerU32 fDivider;
763
764 // blur state
765 char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
766 char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
767 skvx::Vec<4, uint32_t>* fBuffer0Cursor;
768 skvx::Vec<4, uint32_t>* fBuffer1Cursor;
769 };
770
771 // TODO: Implement CPU backend for different fTileMode. This is still worth doing inline with the
772 // blur; at the moment the tiling is applied via the CropImageFilter and carried as metadata on
773 // the FilterResult. This is forcefully applied in onFilterImage() to get a simple SkSpecialImage to
774 // pass to cpu_blur or gpu_blur, which evaluates the tile mode into a kernel-outset buffer that is
775 // then processed by these functions. If the tilemode is the only thing being applied, it would be
776 // ideal to tile from the input image directly instead of inserting a new temporary image. For CPU
777 // blurs this temporary image now creates the appearance of correctness; for GPU blurs that could
778 // tile already it may create a regression.
cpu_blur(const skif::Context & ctx,skif::LayerSpace<SkSize> sigma,const sk_sp<SkSpecialImage> & input,skif::LayerSpace<SkIRect> srcBounds,skif::LayerSpace<SkIRect> dstBounds)779 sk_sp<SkSpecialImage> cpu_blur(const skif::Context& ctx,
780 skif::LayerSpace<SkSize> sigma,
781 const sk_sp<SkSpecialImage>& input,
782 skif::LayerSpace<SkIRect> srcBounds,
783 skif::LayerSpace<SkIRect> dstBounds) {
784 // map_sigma limits sigma to 532 to match 1000px box filter limit of WebKit and Firefox.
785 // Since this does not exceed the limits of the TentPass (2183), there won't be overflow when
786 // computing a kernel over a pixel window filled with 255.
787 static_assert(kMaxSigma <= 2183.0f);
788
789 // The input image should fill the srcBounds
790 SkASSERT(input->width() == srcBounds.width() && input->height() == srcBounds.height());
791
792 SkSTArenaAlloc<1024> alloc;
793 auto makeMaker = [&](double sigma) -> PassMaker* {
794 SkASSERT(0 <= sigma && sigma <= 2183); // should be guaranteed after map_sigma
795 if (PassMaker* maker = GaussPass::MakeMaker(sigma, &alloc)) {
796 return maker;
797 }
798 if (PassMaker* maker = TentPass::MakeMaker(sigma, &alloc)) {
799 return maker;
800 }
801 SK_ABORT("Sigma is out of range.");
802 };
803
804 PassMaker* makerX = makeMaker(sigma.width());
805 PassMaker* makerY = makeMaker(sigma.height());
806 // A no-op blur should have been caught earlier in onFilterImage().
807 SkASSERT(makerX->window() > 1 || makerY->window() > 1);
808
809 SkBitmap src;
810 if (!SkSpecialImages::AsBitmap(input.get(), &src)) {
811 return nullptr;
812 }
813 if (src.colorType() != kN32_SkColorType) {
814 return nullptr;
815 }
816
817 auto originalDstBounds = dstBounds;
818 if (makerX->window() > 1) {
819 // Inflate the dst by the window required for the Y pass so that the X pass can prepare it.
820 // The Y pass will be offset to only write to the original rows in dstBounds, but its window
821 // will access these extra rows calculated by the X pass. The SpecialImage factory will
822 // then subset the bitmap so it appears to match 'originalDstBounds' tightly. We make one
823 // slightly larger image to hold this extra data instead of two separate images sized
824 // exactly to each pass because the CPU blur can write in place.
825 const auto yPadding = skif::LayerSpace<SkSize>({0.f, 3 * sigma.height()}).ceil();
826 dstBounds.outset(yPadding);
827 }
828
829 SkBitmap dst;
830 const skif::LayerSpace<SkIPoint> dstOrigin = dstBounds.topLeft();
831 if (!dst.tryAllocPixels(src.info().makeWH(dstBounds.width(), dstBounds.height()))) {
832 return nullptr;
833 }
834 dst.eraseColor(SK_ColorTRANSPARENT);
835
836 auto buffer = alloc.makeBytesAlignedTo(std::max(makerX->bufferSizeBytes(),
837 makerY->bufferSizeBytes()),
838 alignof(skvx::Vec<4, uint32_t>));
839
840 // Basic Plan: The three cases to handle
841 // * Horizontal and Vertical - blur horizontally while copying values from the source to
842 // the destination. Then, do an in-place vertical blur.
843 // * Horizontal only - blur horizontally copying values from the source to the destination.
844 // * Vertical only - blur vertically copying values from the source to the destination.
845
846 // Initialize these assuming the Y-only case
847 int loopStart = std::max(srcBounds.left(), dstBounds.left());
848 int loopEnd = std::min(srcBounds.right(), dstBounds.right());
849 int dstYOffset = 0;
850
851 if (makerX->window() > 1) {
852 // First an X-only blur from src into dst, including the extra rows that will become input
853 // for the second Y pass, which will then be performed in place.
854 loopStart = std::max(srcBounds.top(), dstBounds.top());
855 loopEnd = std::min(srcBounds.bottom(), dstBounds.bottom());
856
857 auto srcAddr = src.getAddr32(0, loopStart - srcBounds.top());
858 auto dstAddr = dst.getAddr32(0, loopStart - dstBounds.top());
859
860 // Iterate over each row to calculate 1D blur along X.
861 Pass* pass = makerX->makePass(buffer, &alloc);
862 for (int y = loopStart; y < loopEnd; ++y) {
863 pass->blur(srcBounds.left() - dstBounds.left(),
864 srcBounds.right() - dstBounds.left(),
865 dstBounds.width(),
866 srcAddr, 1,
867 dstAddr, 1);
868 srcAddr += src.rowBytesAsPixels();
869 dstAddr += dst.rowBytesAsPixels();
870 }
871
872 // Set up the Y pass to blur from the full dst into the non-outset portion of dst
873 src = dst;
874 loopStart = originalDstBounds.left();
875 loopEnd = originalDstBounds.right();
876 // The new 'dst' is equal to dst.extractSubset(originalDstBounds.offset(-dstOrigin)), but
877 // by construction only the Y offset has an interesting value so this is a little more
878 // efficient.
879 dstYOffset = originalDstBounds.top() - dstBounds.top();
880
881 srcBounds = dstBounds;
882 dstBounds = originalDstBounds;
883 }
884
885 // Iterate over each column to calculate 1D blur along Y. This is either blurring from src into
886 // dst for a 1D blur; or it's blurring from dst into dst for the second pass of a 2D blur.
887 if (makerY->window() > 1) {
888 auto srcAddr = src.getAddr32(loopStart - srcBounds.left(), 0);
889 auto dstAddr = dst.getAddr32(loopStart - dstBounds.left(), dstYOffset);
890
891 Pass* pass = makerY->makePass(buffer, &alloc);
892 for (int x = loopStart; x < loopEnd; ++x) {
893 pass->blur(srcBounds.top() - dstBounds.top(),
894 srcBounds.bottom() - dstBounds.top(),
895 dstBounds.height(),
896 srcAddr, src.rowBytesAsPixels(),
897 dstAddr, dst.rowBytesAsPixels());
898 srcAddr += 1;
899 dstAddr += 1;
900 }
901 }
902
903 originalDstBounds.offset(-dstOrigin); // Make relative to dst's pixels
904 return SkSpecialImages::MakeFromRaster(SkIRect(originalDstBounds),
905 dst,
906 ctx.backend()->surfaceProps());
907 }
908
909 } // namespace
910
// Evaluates the blur for 'ctx': resolves the child input over the kernel-expanded desired output,
// then blurs it through the FilterResult builder when the backend provides a blur engine, or
// through cpu_blur() otherwise. Returns the child's output untouched when both mapped sigmas are
// zero, and an empty result when nothing can be produced.
skif::FilterResult SkBlurImageFilter::onFilterImage(const skif::Context& ctx) const {
    const bool gpuBacked = SkToBool(ctx.backend()->getBlurEngine());
    const bool legacyTiling = fLegacyTileMode != SkTileMode::kDecal;

    // The child must cover the desired output expanded by the blur kernel.
    skif::Context childCtx = ctx.withNewDesiredOutput(
            this->kernelBounds(ctx.mapping(), ctx.desiredOutput(), gpuBacked));
    skif::FilterResult blurSource = this->getChildOutput(0, childCtx);

    const skif::LayerSpace<SkSize> sigma = this->mapSigma(ctx.mapping(), gpuBacked);
    if (sigma.width() == 0.f && sigma.height() == 0.f) {
        // Nothing to blur on either axis; pass the input through.
        return blurSource;
    }

    SkASSERT(sigma.width() >= 0.f && sigma.width() <= kMaxSigma &&
             sigma.height() >= 0.f && sigma.height() <= kMaxSigma);

    // FilterResult::blur() works out a tighter output on its own by default, so start from the
    // original desired output. The CPU path and legacy tiling instead need it computed here from
    // the child's bounds (taken before any legacy crop is applied).
    skif::LayerSpace<SkIRect> outputLimit = ctx.desiredOutput();
    if (legacyTiling || !gpuBacked) {
        outputLimit = this->kernelBounds(ctx.mapping(), blurSource.layerBounds(), gpuBacked);
        if (!outputLimit.intersect(ctx.desiredOutput())) {
            return {};
        }
    }

    if (legacyTiling) {
        // Legacy tiling applied to the input image when no explicit crop rect was given. Cropping
        // to the child's own layer bounds changes the edge tile mode without shrinking the image.
        blurSource = blurSource.applyCrop(childCtx, blurSource.layerBounds(), fLegacyTileMode);
    }

    // TODO(b/40039877): Once the CPU blur functions can handle tile modes and color types beyond
    // N32, there won't be any need to branch on how to apply the blur to the filter result.
    if (gpuBacked) {
        // Without legacy tiling 'outputLimit' is the desired output. For decal it matches what
        // Builder::blur() derives internally. With legacy tiling it depends on the original child
        // output's bounds, ignoring the effect of the tile mode.
        skif::Context croppedOutput = ctx.withNewDesiredOutput(outputLimit);
        skif::FilterResult::Builder builder{croppedOutput};
        builder.add(blurSource);
        return builder.blur(sigma);
    }

    // The CPU blur does not support tile modes yet, so resolve the input to a special image that
    // has the tiling rendered into its pixels.
    auto [resolvedImage, resolvedOrigin] = blurSource.imageAndOffset(childCtx);
    if (!resolvedImage) {
        return {};
    }
    const skif::LayerSpace<SkIRect> srcBounds{SkIRect::MakeXYWH(resolvedOrigin.x(),
                                                               resolvedOrigin.y(),
                                                               resolvedImage->width(),
                                                               resolvedImage->height())};

    sk_sp<SkSpecialImage> blurred =
            cpu_blur(ctx, sigma, std::move(resolvedImage), srcBounds, outputLimit);
    return skif::FilterResult{std::move(blurred), outputLimit.topLeft()};
}
973
// Maps fSigma into layer space, clamps each axis to kMaxSigma, and zeroes any axis that is
// non-finite or that the selected backend ('gpuBacked') treats as an identity blur.
skif::LayerSpace<SkSize> SkBlurImageFilter::mapSigma(const skif::Mapping& mapping,
                                                     bool gpuBacked) const {
    // TODO(b/294575803) - The CPU and GPU implementations have different requirements for
    // "identity", with the GPU able to handle smaller sigmas. calculate_window() returns <= 1 once
    // sigma is below ~0.8. Ideally we should work out the sigma threshold such that the max
    // contribution from adjacent pixels is less than 0.5/255 and use that for both backends.
    // NOTE: For convenience with builds, and the flux that is about to occur with the blur utils,
    // this GPU logic is just copied from GrBlurUtils

    // True when blurring along an axis with this sigma should be disabled: the value is not
    // finite, or it is small enough that the blur is effectively an identity for the backend.
    const auto axisDisabled = [gpuBacked](float axisSigma) -> bool {
        if (!SkIsFinite(axisSigma)) {
            return true;
        }
        if (!gpuBacked) {
            return calculate_window(axisSigma) <= 1;
        }
#if defined(SK_GANESH) || defined(SK_GRAPHITE)
        return skgpu::BlurIsEffectivelyIdentity(axisSigma);
#else
        return false;
#endif
    };

    const skif::LayerSpace<SkSize> mapped = mapping.paramToLayer(fSigma);

    // Clamp to the maximum sigma before testing each axis.
    float sigmaX = std::min(mapped.width(), kMaxSigma);
    float sigmaY = std::min(mapped.height(), kMaxSigma);

    if (axisDisabled(sigmaX)) {
        sigmaX = 0.f;
    }
    if (axisDisabled(sigmaY)) {
        sigmaY = 0.f;
    }

    return skif::LayerSpace<SkSize>({sigmaX, sigmaY});
}
1008
// Returns the layer bounds the child input needs in order to produce 'desiredOutput': the desired
// output is expanded by the blur kernel and then forwarded to child 0 with 'contentBounds'.
skif::LayerSpace<SkIRect> SkBlurImageFilter::onGetInputLayerBounds(
        const skif::Mapping& mapping,
        const skif::LayerSpace<SkIRect>& desiredOutput,
        std::optional<skif::LayerSpace<SkIRect>> contentBounds) const {
    // gpuBacked=true is used because that kernel is the more sensitive one, so the resulting
    // bounds are sufficient for both GPU and CPU evaluation.
    return this->getChildInputLayerBounds(
            0,
            mapping,
            this->kernelBounds(mapping, desiredOutput, /*gpuBacked=*/true),
            contentBounds);
}
1019
onGetOutputLayerBounds(const skif::Mapping & mapping,std::optional<skif::LayerSpace<SkIRect>> contentBounds) const1020 std::optional<skif::LayerSpace<SkIRect>> SkBlurImageFilter::onGetOutputLayerBounds(
1021 const skif::Mapping& mapping,
1022 std::optional<skif::LayerSpace<SkIRect>> contentBounds) const {
1023 auto childOutput = this->getChildOutputLayerBounds(0, mapping, contentBounds);
1024 if (childOutput) {
1025 // Use gpuBacked=true since it will ensure output bounds are conservative; CPU-based blurs
1026 // may produce 1px inset from this for very small sigmas.
1027 return this->kernelBounds(mapping, *childOutput, /*gpuBacked=*/true);
1028 } else {
1029 return skif::LayerSpace<SkIRect>::Unbounded();
1030 }
1031 }
1032
// Returns the input's fast bounds for 'src' (or 'src' itself when there is no input filter),
// outset by three times the sigma on each axis.
SkRect SkBlurImageFilter::computeFastBounds(const SkRect& src) const {
    const SkImageFilter* child = this->getInput(0);
    SkRect blurred = child ? child->computeFastBounds(src) : src;

    const SkSize sigma = SkSize(fSigma);
    blurred.outset(sigma.width() * 3, sigma.height() * 3);
    return blurred;
}
1038