1 /*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "src/core/SkMaskBlurFilter.h"
9
10 #include "include/core/SkColorPriv.h"
11 #include "include/private/SkMalloc.h"
12 #include "include/private/SkNx.h"
13 #include "include/private/SkTPin.h"
14 #include "include/private/SkTemplates.h"
15 #include "include/private/SkTo.h"
16 #include "src/core/SkArenaAlloc.h"
17 #include "src/core/SkGaussFilter.h"
18
19 #include <cmath>
20 #include <climits>
21
22 namespace {
23 static const double kPi = 3.14159265358979323846264338327950288;
24
25 class PlanGauss final {
26 public:
PlanGauss(double sigma)27 explicit PlanGauss(double sigma) {
28 auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
29 auto window = std::max(1, possibleWindow);
30
31 fPass0Size = window - 1;
32 fPass1Size = window - 1;
33 fPass2Size = (window & 1) == 1 ? window - 1 : window;
34
35 // Calculating the border is tricky. I will go through the odd case which is simpler, and
36 // then through the even case. Given a stack of filters seven wide for the odd case of
37 // three passes.
38 //
39 // S
40 // aaaAaaa
41 // bbbBbbb
42 // cccCccc
43 // D
44 //
45 // The furthest changed pixel is when the filters are in the following configuration.
46 //
47 // S
48 // aaaAaaa
49 // bbbBbbb
50 // cccCccc
51 // D
52 //
53 // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
54 // finally D is C. So, with a window size of seven the border is nine. In general, the
55 // border is 3*((window - 1)/2).
56 //
57 // For even cases the filter stack is more complicated. The spec specifies two passes
58 // of even filters and a final pass of odd filters. A stack for a width of six looks like
59 // this.
60 //
61 // S
62 // aaaAaa
63 // bbBbbb
64 // cccCccc
65 // D
66 //
67 // The furthest pixel looks like this.
68 //
69 // S
70 // aaaAaa
71 // bbBbbb
72 // cccCccc
73 // D
74 //
75 // For a window of size, the border value is seven. In general the border is 3 *
76 // (window/2) -1.
77 fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
78 fSlidingWindow = 2 * fBorder + 1;
79
80 // If the window is odd then the divisor is just window ^ 3 otherwise,
81 // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
82 auto window2 = window * window;
83 auto window3 = window2 * window;
84 auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
85
86 fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
87 }
88
bufferSize() const89 size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }
90
border() const91 int border() const { return fBorder; }
92
93 public:
94 class Scan {
95 public:
Scan(uint64_t weight,int noChangeCount,uint32_t * buffer0,uint32_t * buffer0End,uint32_t * buffer1,uint32_t * buffer1End,uint32_t * buffer2,uint32_t * buffer2End)96 Scan(uint64_t weight, int noChangeCount,
97 uint32_t* buffer0, uint32_t* buffer0End,
98 uint32_t* buffer1, uint32_t* buffer1End,
99 uint32_t* buffer2, uint32_t* buffer2End)
100 : fWeight{weight}
101 , fNoChangeCount{noChangeCount}
102 , fBuffer0{buffer0}
103 , fBuffer0End{buffer0End}
104 , fBuffer1{buffer1}
105 , fBuffer1End{buffer1End}
106 , fBuffer2{buffer2}
107 , fBuffer2End{buffer2End}
108 { }
109
blur(const AlphaIter srcBegin,const AlphaIter srcEnd,uint8_t * dst,int dstStride,uint8_t * dstEnd) const110 template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
111 uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
112 auto buffer0Cursor = fBuffer0;
113 auto buffer1Cursor = fBuffer1;
114 auto buffer2Cursor = fBuffer2;
115
116 std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
117
118 uint32_t sum0 = 0;
119 uint32_t sum1 = 0;
120 uint32_t sum2 = 0;
121
122 // Consume the source generating pixels.
123 for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
124 uint32_t leadingEdge = *src;
125 sum0 += leadingEdge;
126 sum1 += sum0;
127 sum2 += sum1;
128
129 *dst = this->finalScale(sum2);
130
131 sum2 -= *buffer2Cursor;
132 *buffer2Cursor = sum1;
133 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
134
135 sum1 -= *buffer1Cursor;
136 *buffer1Cursor = sum0;
137 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
138
139 sum0 -= *buffer0Cursor;
140 *buffer0Cursor = leadingEdge;
141 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
142 }
143
144 // The leading edge is off the right side of the mask.
145 for (int i = 0; i < fNoChangeCount; i++) {
146 uint32_t leadingEdge = 0;
147 sum0 += leadingEdge;
148 sum1 += sum0;
149 sum2 += sum1;
150
151 *dst = this->finalScale(sum2);
152
153 sum2 -= *buffer2Cursor;
154 *buffer2Cursor = sum1;
155 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
156
157 sum1 -= *buffer1Cursor;
158 *buffer1Cursor = sum0;
159 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
160
161 sum0 -= *buffer0Cursor;
162 *buffer0Cursor = leadingEdge;
163 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
164
165 dst += dstStride;
166 }
167
168 // Starting from the right, fill in the rest of the buffer.
169 std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
170
171 sum0 = sum1 = sum2 = 0;
172
173 uint8_t* dstCursor = dstEnd;
174 AlphaIter src = srcEnd;
175 while (dstCursor > dst) {
176 dstCursor -= dstStride;
177 uint32_t leadingEdge = *(--src);
178 sum0 += leadingEdge;
179 sum1 += sum0;
180 sum2 += sum1;
181
182 *dstCursor = this->finalScale(sum2);
183
184 sum2 -= *buffer2Cursor;
185 *buffer2Cursor = sum1;
186 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
187
188 sum1 -= *buffer1Cursor;
189 *buffer1Cursor = sum0;
190 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
191
192 sum0 -= *buffer0Cursor;
193 *buffer0Cursor = leadingEdge;
194 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
195 }
196 }
197
198 private:
199 inline static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
200
finalScale(uint32_t sum) const201 uint8_t finalScale(uint32_t sum) const {
202 return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
203 }
204
205 uint64_t fWeight;
206 int fNoChangeCount;
207 uint32_t* fBuffer0;
208 uint32_t* fBuffer0End;
209 uint32_t* fBuffer1;
210 uint32_t* fBuffer1End;
211 uint32_t* fBuffer2;
212 uint32_t* fBuffer2End;
213 };
214
makeBlurScan(int width,uint32_t * buffer) const215 Scan makeBlurScan(int width, uint32_t* buffer) const {
216 uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
217 buffer0 = buffer;
218 buffer0End = buffer1 = buffer0 + fPass0Size;
219 buffer1End = buffer2 = buffer1 + fPass1Size;
220 buffer2End = buffer2 + fPass2Size;
221 int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
222
223 return Scan(
224 fWeight, noChangeCount,
225 buffer0, buffer0End,
226 buffer1, buffer1End,
227 buffer2, buffer2End);
228 }
229
230 uint64_t fWeight;
231 int fBorder;
232 int fSlidingWindow;
233 int fPass0Size;
234 int fPass1Size;
235 int fPass2Size;
236 };
237
238 } // namespace
239
240 // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
241 // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The
242 // additional + 1 added to window represents adding one more leading element before subtracting the
243 // trailing element.
244 // Explanation of maximums:
245 // sum0 = (window + 1) * 255
246 // sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255
247 // sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255
248 //
249 // The value (window + 1)^3 * 255 must fit in a uint32_t. So,
250 // (window + 1)^3 * 255 < 2^32. window = 255.
251 //
252 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4)
253 // For window <= 255, the largest value for sigma is 135.
SkMaskBlurFilter(double sigmaW,double sigmaH)254 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
255 : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)}
256 , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)}
257 {
258 SkASSERT(sigmaW >= 0);
259 SkASSERT(sigmaH >= 0);
260 }
261
hasNoBlur() const262 bool SkMaskBlurFilter::hasNoBlur() const {
263 return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
264 }
265
266 // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
267 // Each of these converts width (up to 8) mask values to A8.
bw_to_a8(uint8_t * a8,const uint8_t * from,int width)268 static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) {
269 SkASSERT(0 < width && width <= 8);
270
271 uint8_t masks = *from;
272 for (int i = 0; i < width; ++i) {
273 a8[i] = (masks >> (7 - i)) & 1 ? 0xFF
274 : 0x00;
275 }
276 }
lcd_to_a8(uint8_t * a8,const uint8_t * from,int width)277 static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) {
278 SkASSERT(0 < width && width <= 8);
279
280 for (int i = 0; i < width; ++i) {
281 unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i],
282 r = SkPacked16ToR32(rgb),
283 g = SkPacked16ToG32(rgb),
284 b = SkPacked16ToB32(rgb);
285 a8[i] = (r + g + b) / 3;
286 }
287 }
argb32_to_a8(uint8_t * a8,const uint8_t * from,int width)288 static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
289 SkASSERT(0 < width && width <= 8);
290 for (int i = 0; i < width; ++i) {
291 uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i];
292 a8[i] = SkGetPackedA32(rgba);
293 }
294 }
295 using ToA8 = decltype(bw_to_a8);
296
load(const uint8_t * from,int width,ToA8 * toA8)297 static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
298 // Our fast path is a full 8-byte load of A8.
299 // So we'll conditionally handle the two slow paths using tmp:
300 // - if we have a function to convert another mask to A8, use it;
301 // - if not but we have less than 8 bytes to load, load them one at a time.
302 uint8_t tmp[8] = {0,0,0,0, 0,0,0,0};
303 if (toA8) {
304 toA8(tmp, from, width);
305 from = tmp;
306 } else if (width < 8) {
307 for (int i = 0; i < width; ++i) {
308 tmp[i] = from[i];
309 }
310 from = tmp;
311 }
312
313 // Load A8 and convert to 8.8 fixed-point.
314 return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
315 }
316
store(uint8_t * to,const Sk8h & v,int width)317 static void store(uint8_t* to, const Sk8h& v, int width) {
318 Sk8b b = SkNx_cast<uint8_t>(v >> 8);
319 if (width == 8) {
320 b.store(to);
321 } else {
322 uint8_t buffer[8];
323 b.store(buffer);
324 for (int i = 0; i < width; i++) {
325 to[i] = buffer[i];
326 }
327 }
328 };
329
// Zero lane. The run of underscores keeps the shifted-lane tables in blur_x_radius_N readable.
static constexpr uint16_t _____ = 0;
// One half in 8.8 fixed point. Accumulators start from this value so that the final >> 8 in
// store() rounds instead of truncating.
static constexpr uint16_t kHalf = 1u << 7;
332
333 // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
334 // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
// format. The resulting multiply has an 8.24 format, but the mulhi truncates the lower 16 bits,
// resulting in an 8.8 format.
337 //
338 // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
339 // system is setup to minimize the number of multiplies needed.
340 //
341 // Explanation:
342 // Blurring a specific mask value is given by the following equation where D_n is the resulting
343 // mask value and S_n is the source value. The example below is for a filter with a radius of 1
344 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
345 // aligned. The filter is given by G_n where n is the symmetric filter value.
346 //
347 // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
348 //
349 // We can start the source index at an offset relative to the destination separated by the
350 // radius. This results in a non-traditional restating of the above filter.
351 //
352 // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
353 //
354 // If we look at three specific consecutive destinations the following equations result:
355 //
//     D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
//     D[6] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
//     D[7] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
359 //
360 // In the above equations, notice that S[7] is used in all three. In particular, two values are
361 // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
362 // D[7].
363 //
364 // From the point of view of a source value we end up with the following three equations.
365 //
366 // Given S[7]:
367 // D[5] += S[7]*G[1]
368 // D[6] += S[7]*G[0]
369 // D[7] += S[7]*G[1]
370 //
371 // In General:
372 // D[n] += S[n]*G[1]
373 // D[n+1] += S[n]*G[0]
374 // D[n+2] += S[n]*G[1]
375 //
376 // Now these equations can be ganged using SIMD to form:
377 // D[n..n+7] += S[n..n+7]*G[1]
378 // D[n+1..n+8] += S[n..n+7]*G[0]
379 // D[n+2..n+9] += S[n..n+7]*G[1]
380 // The next set of values becomes.
381 // D[n+8..n+15] += S[n+8..n+15]*G[1]
382 // D[n+9..n+16] += S[n+8..n+15]*G[0]
383 // D[n+10..n+17] += S[n+8..n+15]*G[1]
// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
// S[n..n+7] and S[n+8..n+15].
386 //
387 // Just one more transformation allows the code to maintain all working values in
388 // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
389 // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
390 //
391 // D[n..n+7] += S[n..n+7] * G[1]
392 // D[n..n+8] += {0, S[n..n+7] * G[0]}
393 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
394 //
// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
// register d8. In addition, S[n..n+7] becomes s0.
397 //
398 // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
399 //
400 // Sk8h v0 = s0*G[0]
401 // Sk8h v1 = s0*G[1]
402 // /* D[n..n+7] += S[n..n+7] * G[1] */
403 // d0 += v1;
404 // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */
//     d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
//     d8 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
//     /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */
//     d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
//     d8 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
410 // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
411
blur_x_radius_1(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h &,const Sk8h &,const Sk8h &,Sk8h * d0,Sk8h * d8)412 static void blur_x_radius_1(
413 const Sk8h& s0,
414 const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
415 Sk8h* d0, Sk8h* d8) {
416
417 auto v1 = s0.mulHi(g1);
418 auto v0 = s0.mulHi(g0);
419
420 // D[n..n+7] += S[n..n+7] * G[1]
421 *d0 += v1;
422
423 //D[n..n+8] += {0, S[n..n+7] * G[0]}
424 *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
425 *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
426
427 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
428 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
429 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
430
431 }
432
blur_x_radius_2(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h &,const Sk8h &,Sk8h * d0,Sk8h * d8)433 static void blur_x_radius_2(
434 const Sk8h& s0,
435 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
436 Sk8h* d0, Sk8h* d8) {
437 auto v0 = s0.mulHi(g0);
438 auto v1 = s0.mulHi(g1);
439 auto v2 = s0.mulHi(g2);
440
441 // D[n..n+7] += S[n..n+7] * G[2]
442 *d0 += v2;
443
444 // D[n..n+8] += {0, S[n..n+7] * G[1]}
445 *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
446 *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
447
448 // D[n..n+9] += {0, 0, S[n..n+7] * G[0]}
449 *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
450 *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
451
452 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
453 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
454 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
455
456 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]}
457 *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
458 *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
459 }
460
blur_x_radius_3(const Sk8h & s0,const Sk8h & gauss0,const Sk8h & gauss1,const Sk8h & gauss2,const Sk8h & gauss3,const Sk8h &,Sk8h * d0,Sk8h * d8)461 static void blur_x_radius_3(
462 const Sk8h& s0,
463 const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
464 Sk8h* d0, Sk8h* d8) {
465 auto v0 = s0.mulHi(gauss0);
466 auto v1 = s0.mulHi(gauss1);
467 auto v2 = s0.mulHi(gauss2);
468 auto v3 = s0.mulHi(gauss3);
469
470 // D[n..n+7] += S[n..n+7] * G[3]
471 *d0 += v3;
472
473 // D[n..n+8] += {0, S[n..n+7] * G[2]}
474 *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
475 *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
476
477 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]}
478 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
479 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
480
481 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]}
482 *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
483 *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
484
485 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]}
486 *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
487 *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
488
489 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
490 *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
491 *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
492
493 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
494 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
495 *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
496 }
497
blur_x_radius_4(const Sk8h & s0,const Sk8h & gauss0,const Sk8h & gauss1,const Sk8h & gauss2,const Sk8h & gauss3,const Sk8h & gauss4,Sk8h * d0,Sk8h * d8)498 static void blur_x_radius_4(
499 const Sk8h& s0,
500 const Sk8h& gauss0,
501 const Sk8h& gauss1,
502 const Sk8h& gauss2,
503 const Sk8h& gauss3,
504 const Sk8h& gauss4,
505 Sk8h* d0, Sk8h* d8) {
506 auto v0 = s0.mulHi(gauss0);
507 auto v1 = s0.mulHi(gauss1);
508 auto v2 = s0.mulHi(gauss2);
509 auto v3 = s0.mulHi(gauss3);
510 auto v4 = s0.mulHi(gauss4);
511
512 // D[n..n+7] += S[n..n+7] * G[4]
513 *d0 += v4;
514
515 // D[n..n+8] += {0, S[n..n+7] * G[3]}
516 *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
517 *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
518
519 // D[n..n+9] += {0, 0, S[n..n+7] * G[2]}
520 *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
521 *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
522
523 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]}
524 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
525 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
526
527 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]}
528 *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
529 *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
530
531 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
532 *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
533 *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
534
535 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
536 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
537 *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
538
539 // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
540 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
541 *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
542
543 // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
544 *d8 += v4;
545 }
546
547 using BlurX = decltype(blur_x_radius_1);
548
549 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_row(BlurX blur,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h & g4,const uint8_t * src,int srcW,uint8_t * dst,int dstW)550 static void blur_row(
551 BlurX blur,
552 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
553 const uint8_t* src, int srcW,
554 uint8_t* dst, int dstW) {
555 // Clear the buffer to handle summing wider than source.
556 Sk8h d0{kHalf}, d8{kHalf};
557
558 // Go by multiples of 8 in src.
559 int x = 0;
560 for (; x <= srcW - 8; x += 8) {
561 blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
562
563 store(dst, d0, 8);
564
565 d0 = d8;
566 d8 = Sk8h{kHalf};
567
568 src += 8;
569 dst += 8;
570 }
571
572 // There are src values left, but the remainder of src values is not a multiple of 8.
573 int srcTail = srcW - x;
574 if (srcTail > 0) {
575
576 blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
577
578 int dstTail = std::min(8, dstW - x);
579 store(dst, d0, dstTail);
580
581 d0 = d8;
582 dst += dstTail;
583 x += dstTail;
584 }
585
586 // There are dst mask values to complete.
587 int dstTail = dstW - x;
588 if (dstTail > 0) {
589 store(dst, d0, dstTail);
590 }
591 }
592
593 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_x_rect(BlurX blur,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)594 static void blur_x_rect(BlurX blur,
595 uint16_t* gauss,
596 const uint8_t* src, size_t srcStride, int srcW,
597 uint8_t* dst, size_t dstStride, int dstW, int dstH) {
598
599 Sk8h g0{gauss[0]},
600 g1{gauss[1]},
601 g2{gauss[2]},
602 g3{gauss[3]},
603 g4{gauss[4]};
604
605 // Blur *ALL* the rows.
606 for (int y = 0; y < dstH; y++) {
607 blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
608 src += srcStride;
609 dst += dstStride;
610 }
611 }
612
direct_blur_x(int radius,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)613 static void direct_blur_x(int radius, uint16_t* gauss,
614 const uint8_t* src, size_t srcStride, int srcW,
615 uint8_t* dst, size_t dstStride, int dstW, int dstH) {
616
617 switch (radius) {
618 case 1:
619 blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
620 break;
621
622 case 2:
623 blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
624 break;
625
626 case 3:
627 blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
628 break;
629
630 case 4:
631 blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
632 break;
633
634 default:
635 SkASSERTF(false, "The radius %d is not handled\n", radius);
636 }
637 }
638
639 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
640 // functions, but end up being simpler because there is no complicated shift of registers. We
641 // start with the non-traditional form of the gaussian filter. In the following r is the value
642 // when added generates the next value in the column.
643 //
644 // D[n+0r] = S[n+0r]*G[1]
645 // + S[n+1r]*G[0]
646 // + S[n+2r]*G[1]
647 //
648 // Expanding out in a way similar to blur_x_radius_N for specific values of n.
649 //
650 // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
651 // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
652 // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
653 //
// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
// can look at the calculation from the point of view of a source value.
656 //
657 // Given S[n+0r]:
658 // D[n+0r] += S[n+0r]*G[1];
659 // /* D[n+0r] is done and can be stored now. */
660 // D[n+1r] += S[n+0r]*G[0];
661 // D[n+2r] = S[n+0r]*G[1];
662 //
663 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
664 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
665 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
666 // how values flow in the following two iterations in source.
667 //
668 // D[n+0r] += S[n+0r]*G[1]
669 // D[n+1r] += S[n+0r]*G[0]
670 // D[n+2r] = S[n+0r]*G[1]
671 // /* ------- */
672 // D[n+1r] += S[n+1r]*G[1]
673 // D[n+2r] += S[n+1r]*G[0]
674 // D[n+3r] = S[n+1r]*G[1]
675 //
676 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes
677 // to the following.
678 //
679 // answer = d01 + S[n+0r]*G[1]
680 // d01 = d12 + S[n+0r]*G[0]
681 // d12 = S[n+0r]*G[1]
682 // return answer
683 //
684 // Finally, this can be ganged into SIMD style.
685 // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
686 // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0]
687 // d12[0..7] = S[n+0r..n+0r+7]*G[1]
688 // return answer[0..7]
// Vertical 3-tap step (G[1] G[0] G[1]) for eight columns at once. Takes one source row `s0`,
// returns the destination row that is now complete, and advances the two partially summed
// rows held in *d01 and *d12. The unnamed parameters exist so that all blur_y_radius_N
// functions share one signature.
static Sk8h blur_y_radius_1(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
        Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    const Sk8h w0 = s0.mulHi(g0);  // S * G[0]
    const Sk8h w1 = s0.mulHi(g1);  // S * G[1]

    // Each accumulator takes over the row below it plus this source row's share, so the
    // statements must run from the oldest row to the newest.
    Sk8h finished = *d01 + w1;
    *d01 = *d12 + w0;
    *d12 = w1 + kHalf;  // a fresh row starts from one half so the final shift rounds

    return finished;
}
702
// Vertical 5-tap step (G[2] G[1] G[0] G[1] G[2]) for eight columns at once. Takes one source
// row `s0`, returns the destination row that is now complete, and advances the four partially
// summed rows held in *d01 .. *d34. The unnamed parameters exist so that all blur_y_radius_N
// functions share one signature.
static Sk8h blur_y_radius_2(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    const Sk8h w0 = s0.mulHi(g0);  // S * G[0]
    const Sk8h w1 = s0.mulHi(g1);  // S * G[1]
    const Sk8h w2 = s0.mulHi(g2);  // S * G[2]

    // Each accumulator takes over the row below it plus this source row's share, so the
    // statements must run from the oldest row to the newest.
    Sk8h finished = *d01 + w2;
    *d01 = *d12 + w1;
    *d12 = *d23 + w0;
    *d23 = *d34 + w1;
    *d34 = w2 + kHalf;  // a fresh row starts from one half so the final shift rounds

    return finished;
}
719
// Vertical 7-tap step (G[3] .. G[0] .. G[3]) for eight columns at once. Takes one source row
// `s0`, returns the destination row that is now complete, and advances the six partially
// summed rows held in *d01 .. *d56. The unnamed parameters exist so that all blur_y_radius_N
// functions share one signature.
static Sk8h blur_y_radius_3(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
    const Sk8h w0 = s0.mulHi(g0);  // S * G[0]
    const Sk8h w1 = s0.mulHi(g1);  // S * G[1]
    const Sk8h w2 = s0.mulHi(g2);  // S * G[2]
    const Sk8h w3 = s0.mulHi(g3);  // S * G[3]

    // Each accumulator takes over the row below it plus this source row's share, so the
    // statements must run from the oldest row to the newest.
    Sk8h finished = *d01 + w3;
    *d01 = *d12 + w2;
    *d12 = *d23 + w1;
    *d23 = *d34 + w0;
    *d34 = *d45 + w1;
    *d45 = *d56 + w2;
    *d56 = w3 + kHalf;  // a fresh row starts from one half so the final shift rounds

    return finished;
}
739
// Vertical 9-tap step (G[4] .. G[0] .. G[4]) for eight columns at once. Takes one source row
// `s0`, returns the destination row that is now complete, and advances the eight partially
// summed rows held in *d01 .. *d78.
static Sk8h blur_y_radius_4(
        const Sk8h& s0,
        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
    const Sk8h w0 = s0.mulHi(g0);  // S * G[0]
    const Sk8h w1 = s0.mulHi(g1);  // S * G[1]
    const Sk8h w2 = s0.mulHi(g2);  // S * G[2]
    const Sk8h w3 = s0.mulHi(g3);  // S * G[3]
    const Sk8h w4 = s0.mulHi(g4);  // S * G[4]

    // Each accumulator takes over the row below it plus this source row's share, so the
    // statements must run from the oldest row to the newest.
    Sk8h finished = *d01 + w4;
    *d01 = *d12 + w3;
    *d12 = *d23 + w2;
    *d23 = *d34 + w1;
    *d34 = *d45 + w0;
    *d45 = *d56 + w1;
    *d56 = *d67 + w2;
    *d67 = *d78 + w3;
    *d78 = w4 + kHalf;  // a fresh row starts from one half so the final shift rounds

    return finished;
}
762
763 using BlurY = decltype(blur_y_radius_1);
764
765 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_column(ToA8 toA8,BlurY blur,int radius,int width,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h & g4,const uint8_t * src,size_t srcRB,int srcH,uint8_t * dst,size_t dstRB)766 static void blur_column(
767 ToA8 toA8,
768 BlurY blur, int radius, int width,
769 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
770 const uint8_t* src, size_t srcRB, int srcH,
771 uint8_t* dst, size_t dstRB) {
772 Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
773 d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
774
775 auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
776 store(to, v0, width);
777 to += dstRB;
778 store(to, v1, width);
779 return to + dstRB;
780 };
781
782 for (int y = 0; y < srcH; y += 1) {
783 auto s = load(src, width, toA8);
784 auto b = blur(s,
785 g0, g1, g2, g3, g4,
786 &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
787 store(dst, b, width);
788 src += srcRB;
789 dst += dstRB;
790 }
791
792 if (radius >= 1) {
793 dst = flush(dst, d01, d12);
794 }
795 if (radius >= 2) {
796 dst = flush(dst, d23, d34);
797 }
798 if (radius >= 3) {
799 dst = flush(dst, d45, d56);
800 }
801 if (radius >= 4) {
802 flush(dst, d67, d78);
803 }
804 }
805
806 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_y_rect(ToA8 toA8,const int strideOf8,BlurY blur,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)807 static void blur_y_rect(ToA8 toA8, const int strideOf8,
808 BlurY blur, int radius, uint16_t *gauss,
809 const uint8_t *src, size_t srcRB, int srcW, int srcH,
810 uint8_t *dst, size_t dstRB) {
811
812 Sk8h g0{gauss[0]},
813 g1{gauss[1]},
814 g2{gauss[2]},
815 g3{gauss[3]},
816 g4{gauss[4]};
817
818 int x = 0;
819 for (; x <= srcW - 8; x += 8) {
820 blur_column(toA8, blur, radius, 8,
821 g0, g1, g2, g3, g4,
822 src, srcRB, srcH,
823 dst, dstRB);
824 src += strideOf8;
825 dst += 8;
826 }
827
828 int xTail = srcW - x;
829 if (xTail > 0) {
830 blur_column(toA8, blur, radius, xTail,
831 g0, g1, g2, g3, g4,
832 src, srcRB, srcH,
833 dst, dstRB);
834 }
835 }
836
direct_blur_y(ToA8 toA8,const int strideOf8,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)837 static void direct_blur_y(ToA8 toA8, const int strideOf8,
838 int radius, uint16_t* gauss,
839 const uint8_t* src, size_t srcRB, int srcW, int srcH,
840 uint8_t* dst, size_t dstRB) {
841
842 switch (radius) {
843 case 1:
844 blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss,
845 src, srcRB, srcW, srcH,
846 dst, dstRB);
847 break;
848
849 case 2:
850 blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss,
851 src, srcRB, srcW, srcH,
852 dst, dstRB);
853 break;
854
855 case 3:
856 blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss,
857 src, srcRB, srcW, srcH,
858 dst, dstRB);
859 break;
860
861 case 4:
862 blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss,
863 src, srcRB, srcW, srcH,
864 dst, dstRB);
865 break;
866
867 default:
868 SkASSERTF(false, "The radius %d is not handled\n", radius);
869 }
870 }
871
small_blur(double sigmaX,double sigmaY,const SkMask & src,SkMask * dst)872 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
873 SkASSERT(sigmaX == sigmaY); // TODO
874 SkASSERT(0.01 <= sigmaX && sigmaX < 2);
875 SkASSERT(0.01 <= sigmaY && sigmaY < 2);
876
877 SkGaussFilter filterX{sigmaX},
878 filterY{sigmaY};
879
880 int radiusX = filterX.radius(),
881 radiusY = filterY.radius();
882
883 SkASSERT(radiusX <= 4 && radiusY <= 4);
884
885 auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
886 int i = 0;
887 for (double d : filter) {
888 factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
889 }
890 };
891
892 uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
893 gaussFactorsY[SkGaussFilter::kGaussArrayMax];
894
895 prepareGauss(filterX, gaussFactorsX);
896 prepareGauss(filterY, gaussFactorsY);
897
898 *dst = SkMask::PrepareDestination(radiusX, radiusY, src);
899 if (src.fImage == nullptr) {
900 return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
901 }
902 if (dst->fImage == nullptr) {
903 dst->fBounds.setEmpty();
904 return {0, 0};
905 }
906
907 int srcW = src.fBounds.width(),
908 srcH = src.fBounds.height();
909
910 int dstW = dst->fBounds.width(),
911 dstH = dst->fBounds.height();
912
913 size_t srcRB = src.fRowBytes,
914 dstRB = dst->fRowBytes;
915
916 //TODO: handle bluring in only one direction.
917
918 // Blur vertically and copy to destination.
919 switch (src.fFormat) {
920 case SkMask::kBW_Format:
921 direct_blur_y(bw_to_a8, 1,
922 radiusY, gaussFactorsY,
923 src.fImage, srcRB, srcW, srcH,
924 dst->fImage + radiusX, dstRB);
925 break;
926 case SkMask::kA8_Format:
927 direct_blur_y(nullptr, 8,
928 radiusY, gaussFactorsY,
929 src.fImage, srcRB, srcW, srcH,
930 dst->fImage + radiusX, dstRB);
931 break;
932 case SkMask::kARGB32_Format:
933 direct_blur_y(argb32_to_a8, 32,
934 radiusY, gaussFactorsY,
935 src.fImage, srcRB, srcW, srcH,
936 dst->fImage + radiusX, dstRB);
937 break;
938 case SkMask::kLCD16_Format:
939 direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY,
940 src.fImage, srcRB, srcW, srcH,
941 dst->fImage + radiusX, dstRB);
942 break;
943 default:
944 SK_ABORT("Unhandled format.");
945 }
946
947 // Blur horizontally in place.
948 direct_blur_x(radiusX, gaussFactorsX,
949 dst->fImage + radiusX, dstRB, srcW,
950 dst->fImage, dstRB, dstW, dstH);
951
952 return {radiusX, radiusY};
953 }
954
955 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
956 // API forces the sigmas to be the same.
blur(const SkMask & src,SkMask * dst) const957 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
958
959 if (fSigmaW < 2.0 && fSigmaH < 2.0) {
960 return small_blur(fSigmaW, fSigmaH, src, dst);
961 }
962
963 // 1024 is a place holder guess until more analysis can be done.
964 SkSTArenaAlloc<1024> alloc;
965
966 PlanGauss planW(fSigmaW);
967 PlanGauss planH(fSigmaH);
968
969 int borderW = planW.border(),
970 borderH = planH.border();
971 SkASSERT(borderH >= 0 && borderW >= 0);
972
973 *dst = SkMask::PrepareDestination(borderW, borderH, src);
974 if (src.fImage == nullptr) {
975 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
976 }
977 if (dst->fImage == nullptr) {
978 dst->fBounds.setEmpty();
979 return {0, 0};
980 }
981
982 int srcW = src.fBounds.width(),
983 srcH = src.fBounds.height(),
984 dstW = dst->fBounds.width(),
985 dstH = dst->fBounds.height();
986 SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
987
988 auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
989 auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
990
991 // Blur both directions.
992 int tmpW = srcH,
993 tmpH = dstW;
994
995 // Make sure not to overflow the multiply for the tmp buffer size.
996 if (tmpH > std::numeric_limits<int>::max() / tmpW) {
997 return {0, 0};
998 }
999 auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
1000
1001 // Blur horizontally, and transpose.
1002 const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
1003 switch (src.fFormat) {
1004 case SkMask::kBW_Format: {
1005 const uint8_t* bwStart = src.fImage;
1006 auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
1007 auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
1008 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1009 auto tmpStart = &tmp[y];
1010 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1011 }
1012 } break;
1013 case SkMask::kA8_Format: {
1014 const uint8_t* a8Start = src.fImage;
1015 auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
1016 auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
1017 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1018 auto tmpStart = &tmp[y];
1019 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1020 }
1021 } break;
1022 case SkMask::kARGB32_Format: {
1023 const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
1024 auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
1025 auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
1026 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1027 auto tmpStart = &tmp[y];
1028 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1029 }
1030 } break;
1031 case SkMask::kLCD16_Format: {
1032 const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
1033 auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
1034 auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
1035 for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1036 auto tmpStart = &tmp[y];
1037 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1038 }
1039 } break;
1040 default:
1041 SK_ABORT("Unhandled format.");
1042 }
1043
1044 // Blur vertically (scan in memory order because of the transposition),
1045 // and transpose back to the original orientation.
1046 const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
1047 for (int y = 0; y < tmpH; y++) {
1048 auto tmpStart = &tmp[y * tmpW];
1049 auto dstStart = &dst->fImage[y];
1050
1051 scanH.blur(tmpStart, tmpStart + tmpW,
1052 dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
1053 }
1054
1055 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
1056 }
1057