• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2   Copyright 1999-2019 ImageMagick Studio LLC, a non-profit organization
3   dedicated to making software imaging solutions freely available.
4 
5   You may not use this file except in compliance with the License.  You may
6   obtain a copy of the License at
7 
8     https://imagemagick.org/script/license.php
9 
10   Unless required by applicable law or agreed to in writing, software
11   distributed under the License is distributed on an "AS IS" BASIS,
12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   See the License for the specific language governing permissions and
14   limitations under the License.
15 
16   MagickCore private kernels for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29   Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE()		"\n #""else " " \n"
34 #define OPENCL_ENDIF()		"\n #""endif " " \n"
35 #define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char *accelerateKernels =
39 
40 /*
41   Define declarations.
42 */
43   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49   OPENCL_DEFINE(SigmaRandom, (attenuate))
50   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53 
54 /*
55   Typedef declarations.
56 */
57   STRINGIFY(
58     typedef enum
59     {
60       UndefinedColorspace,
61       CMYColorspace,           /* negated linear RGB colorspace */
62       CMYKColorspace,          /* CMY with Black separation */
63       GRAYColorspace,          /* Single Channel greyscale (non-linear) image */
64       HCLColorspace,
65       HCLpColorspace,
66       HSBColorspace,
67       HSIColorspace,
68       HSLColorspace,
69       HSVColorspace,           /* alias for HSB */
70       HWBColorspace,
71       LabColorspace,
72       LCHColorspace,           /* alias for LCHuv */
73       LCHabColorspace,         /* Cylindrical (Polar) Lab */
74       LCHuvColorspace,         /* Cylindrical (Polar) Luv */
75       LogColorspace,
76       LMSColorspace,
77       LuvColorspace,
78       OHTAColorspace,
79       Rec601YCbCrColorspace,
80       Rec709YCbCrColorspace,
81       RGBColorspace,           /* Linear RGB colorspace */
82       scRGBColorspace,         /* ??? */
83       sRGBColorspace,          /* Default: non-linear sRGB colorspace */
84       TransparentColorspace,
85       xyYColorspace,
86       XYZColorspace,           /* IEEE Color Reference colorspace */
87       YCbCrColorspace,
88       YCCColorspace,
89       YDbDrColorspace,
90       YIQColorspace,
91       YPbPrColorspace,
92       YUVColorspace,
93       LinearGRAYColorspace     /* Single Channel greyscale (linear) image */
94     } ColorspaceType;
95   )
96 
97   STRINGIFY(
98     typedef enum
99     {
100       UndefinedCompositeOp,
101       AlphaCompositeOp,
102       AtopCompositeOp,
103       BlendCompositeOp,
104       BlurCompositeOp,
105       BumpmapCompositeOp,
106       ChangeMaskCompositeOp,
107       ClearCompositeOp,
108       ColorBurnCompositeOp,
109       ColorDodgeCompositeOp,
110       ColorizeCompositeOp,
111       CopyBlackCompositeOp,
112       CopyBlueCompositeOp,
113       CopyCompositeOp,
114       CopyCyanCompositeOp,
115       CopyGreenCompositeOp,
116       CopyMagentaCompositeOp,
117       CopyAlphaCompositeOp,
118       CopyRedCompositeOp,
119       CopyYellowCompositeOp,
120       DarkenCompositeOp,
121       DarkenIntensityCompositeOp,
122       DifferenceCompositeOp,
123       DisplaceCompositeOp,
124       DissolveCompositeOp,
125       DistortCompositeOp,
126       DivideDstCompositeOp,
127       DivideSrcCompositeOp,
128       DstAtopCompositeOp,
129       DstCompositeOp,
130       DstInCompositeOp,
131       DstOutCompositeOp,
132       DstOverCompositeOp,
133       ExclusionCompositeOp,
134       HardLightCompositeOp,
135       HardMixCompositeOp,
136       HueCompositeOp,
137       InCompositeOp,
138       IntensityCompositeOp,
139       LightenCompositeOp,
140       LightenIntensityCompositeOp,
141       LinearBurnCompositeOp,
142       LinearDodgeCompositeOp,
143       LinearLightCompositeOp,
144       LuminizeCompositeOp,
145       MathematicsCompositeOp,
146       MinusDstCompositeOp,
147       MinusSrcCompositeOp,
148       ModulateCompositeOp,
149       ModulusAddCompositeOp,
150       ModulusSubtractCompositeOp,
151       MultiplyCompositeOp,
152       NoCompositeOp,
153       OutCompositeOp,
154       OverCompositeOp,
155       OverlayCompositeOp,
156       PegtopLightCompositeOp,
157       PinLightCompositeOp,
158       PlusCompositeOp,
159       ReplaceCompositeOp,
160       SaturateCompositeOp,
161       ScreenCompositeOp,
162       SoftLightCompositeOp,
163       SrcAtopCompositeOp,
164       SrcCompositeOp,
165       SrcInCompositeOp,
166       SrcOutCompositeOp,
167       SrcOverCompositeOp,
168       ThresholdCompositeOp,
169       VividLightCompositeOp,
170       XorCompositeOp,
171       StereoCompositeOp
172     } CompositeOperator;
173   )
174 
175   STRINGIFY(
176     typedef enum
177     {
178       UndefinedFunction,
179       ArcsinFunction,
180       ArctanFunction,
181       PolynomialFunction,
182       SinusoidFunction
183     } MagickFunction;
184   )
185 
186   STRINGIFY(
187     typedef enum
188     {
189       UndefinedNoise,
190       UniformNoise,
191       GaussianNoise,
192       MultiplicativeGaussianNoise,
193       ImpulseNoise,
194       LaplacianNoise,
195       PoissonNoise,
196       RandomNoise
197     } NoiseType;
198   )
199 
200   STRINGIFY(
201     typedef enum
202     {
203       UndefinedPixelIntensityMethod = 0,
204       AveragePixelIntensityMethod,
205       BrightnessPixelIntensityMethod,
206       LightnessPixelIntensityMethod,
207       MSPixelIntensityMethod,
208       Rec601LumaPixelIntensityMethod,
209       Rec601LuminancePixelIntensityMethod,
210       Rec709LumaPixelIntensityMethod,
211       Rec709LuminancePixelIntensityMethod,
212       RMSPixelIntensityMethod
213     } PixelIntensityMethod;
214   )
215 
216   STRINGIFY(
217     typedef enum
218     {
219       BoxWeightingFunction = 0,
220       TriangleWeightingFunction,
221       CubicBCWeightingFunction,
222       HannWeightingFunction,
223       HammingWeightingFunction,
224       BlackmanWeightingFunction,
225       GaussianWeightingFunction,
226       QuadraticWeightingFunction,
227       JincWeightingFunction,
228       SincWeightingFunction,
229       SincFastWeightingFunction,
230       KaiserWeightingFunction,
231       WelchWeightingFunction,
232       BohmanWeightingFunction,
233       LagrangeWeightingFunction,
234       CosineWeightingFunction
235     } ResizeWeightingFunctionType;
236   )
237 
238   STRINGIFY(
239     typedef enum
240     {
241       UndefinedChannel = 0x0000,
242       RedChannel = 0x0001,
243       GrayChannel = 0x0001,
244       CyanChannel = 0x0001,
245       GreenChannel = 0x0002,
246       MagentaChannel = 0x0002,
247       BlueChannel = 0x0004,
248       YellowChannel = 0x0004,
249       BlackChannel = 0x0008,
250       AlphaChannel = 0x0010,
251       OpacityChannel = 0x0010,
252       IndexChannel = 0x0020,             /* Color Index Table? */
253       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
254       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
255       MetaChannel = 0x0100,              /* ???? */
256       CompositeChannels = 0x001F,
257       AllChannels = 0x7ffffff,
258       /*
259         Special purpose channel types.
260         FUTURE: are these needed any more - they are more like hacks
261         SyncChannels for example is NOT a real channel but a 'flag'
262         It really says -- "User has not defined channels"
263         Though it does have extra meaning in the "-auto-level" operator
264       */
265       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
266       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
267       GrayChannels = 0x0400,
268       SyncChannels = 0x20000,    /* channels modified as a single unit */
269       DefaultChannels = AllChannels
270     } ChannelType;  /* must correspond to PixelChannel */
271   )
272 
273 /*
274   Helper functions.
275 */
276 
277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
278 
279   STRINGIFY(
280     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
281     {
282       return((CLQuantum) value);
283     }
284   )
285 
286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
287 
288   STRINGIFY(
289     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
290     {
291       return((CLQuantum) (257.0f*value));
292     }
293   )
294 
295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
296 
297   STRINGIFY(
298     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
299     {
300       return((CLQuantum) (16843009.0*value));
301     }
302   )
303 
304 OPENCL_ENDIF()
305 
306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
307 
308   STRINGIFY(
309     inline CLQuantum ClampToQuantum(const float value)
310       {
311         return (CLQuantum) clamp(value, 0.0f, QuantumRange);
312       }
313   )
314 
315 OPENCL_ELSE()
316 
317   STRINGIFY(
318     inline CLQuantum ClampToQuantum(const float value)
319       {
320         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
321       }
322   )
323 
324 OPENCL_ENDIF()
325 
326   STRINGIFY(
327     inline int ClampToCanvas(const int offset,const int range)
328       {
329         return clamp(offset, (int)0, range-1);
330       }
331   )
332 
333   STRINGIFY(
334     inline uint ScaleQuantumToMap(CLQuantum value)
335       {
336         if (value >= (CLQuantum) MaxMap)
337           return ((uint)MaxMap);
338         else
339           return ((uint)value);
340       }
341   )
342 
343   STRINGIFY(
344     inline float PerceptibleReciprocal(const float x)
345     {
346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348     }
349   )
350 
351   STRINGIFY(
352   inline float RoundToUnity(const float value)
353    {
354      return clamp(value,0.0f,1.0f);
355    }
356   )
357 
358   STRINGIFY(
359 
360   inline unsigned int getPixelIndex(const unsigned int number_channels,
361     const unsigned int columns, const unsigned int x, const unsigned int y)
362   {
363     return (x * number_channels) + (y * columns * number_channels);
364   }
365 
366   inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
367   inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
368   inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
369   inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
370 
371   inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
372   inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
373   inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
374   inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
375 
376   inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
377   inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
378   inline float getBlueF4(float4 p)                      { return p.x; }
379   inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
380 
381   inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
382   inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
383   inline float getGreenF4(float4 p)                     { return p.y; }
384   inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
385 
386   inline CLQuantum getRed(CLPixelType p)                { return p.z; }
387   inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
388   inline float getRedF4(float4 p)                       { return p.z; }
389   inline void setRedF4(float4* p, float value)          { (*p).z = value; }
390 
391   inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
392   inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
393   inline float getAlphaF4(float4 p)                     { return p.w; }
394   inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
395 
396   inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
397     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
398   {
399     if ((channel & RedChannel) != 0)
400       *red=getPixelRed(p);
401 
402     if (number_channels > 2)
403       {
404         if ((channel & GreenChannel) != 0)
405           *green=getPixelGreen(p);
406 
407         if ((channel & BlueChannel) != 0)
408           *blue=getPixelBlue(p);
409       }
410 
411     if (((number_channels == 4) || (number_channels == 2)) &&
412         ((channel & AlphaChannel) != 0))
413       *alpha=getPixelAlpha(p,number_channels);
414   }
415 
416   inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
417     const unsigned int columns, const unsigned int x, const unsigned int y)
418   {
419     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
420 
421     float4 pixel;
422 
423     pixel.x=getPixelRed(p);
424 
425     if (number_channels > 2)
426       {
427         pixel.y=getPixelGreen(p);
428         pixel.z=getPixelBlue(p);
429       }
430 
431     if ((number_channels == 4) || (number_channels == 2))
432       pixel.w=getPixelAlpha(p,number_channels);
433     return(pixel);
434   }
435 
436   inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
437     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
438   {
439     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
440 
441     float red = 0.0f;
442     float green = 0.0f;
443     float blue = 0.0f;
444     float alpha = 0.0f;
445 
446     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
447     return (float4)(red, green, blue, alpha);
448   }
449 
450   inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
451     const ChannelType channel, float red, float green, float blue, float alpha)
452   {
453     if ((channel & RedChannel) != 0)
454       setPixelRed(p,ClampToQuantum(red));
455 
456     if (number_channels > 2)
457       {
458         if ((channel & GreenChannel) != 0)
459           setPixelGreen(p,ClampToQuantum(green));
460 
461         if ((channel & BlueChannel) != 0)
462           setPixelBlue(p,ClampToQuantum(blue));
463       }
464 
465     if (((number_channels == 4) || (number_channels == 2)) &&
466         ((channel & AlphaChannel) != 0))
467       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
468   }
469 
470   inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
471     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
472   {
473     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
474 
475     setPixelRed(p,ClampToQuantum(pixel.x));
476 
477     if (number_channels > 2)
478       {
479         setPixelGreen(p,ClampToQuantum(pixel.y));
480         setPixelBlue(p,ClampToQuantum(pixel.z));
481       }
482 
483     if ((number_channels == 4) || (number_channels == 2))
484       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
485   }
486 
487   inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
488     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
489     float4 pixel)
490   {
491     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
492     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
493   }
494 
495   inline float GetPixelIntensity(const unsigned int colorspace,
496     const unsigned int method,float red,float green,float blue)
497   {
498     float intensity;
499 
500     if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
501       return red;
502 
503     switch (method)
504     {
505       case AveragePixelIntensityMethod:
506         {
507           intensity=(red+green+blue)/3.0;
508           break;
509         }
510       case BrightnessPixelIntensityMethod:
511         {
512           intensity=MagickMax(MagickMax(red,green),blue);
513           break;
514         }
515       case LightnessPixelIntensityMethod:
516         {
517           intensity=(MagickMin(MagickMin(red,green),blue)+
518               MagickMax(MagickMax(red,green),blue))/2.0;
519           break;
520         }
521       case MSPixelIntensityMethod:
522         {
523           intensity=(float) (((float) red*red+green*green+blue*blue)/
524               (3.0*QuantumRange));
525           break;
526         }
527       case Rec601LumaPixelIntensityMethod:
528         {
529           /*
530           if (image->colorspace == RGBColorspace)
531           {
532             red=EncodePixelGamma(red);
533             green=EncodePixelGamma(green);
534             blue=EncodePixelGamma(blue);
535           }
536           */
537           intensity=0.298839*red+0.586811*green+0.114350*blue;
538           break;
539         }
540       case Rec601LuminancePixelIntensityMethod:
541         {
542           /*
543           if (image->colorspace == sRGBColorspace)
544           {
545             red=DecodePixelGamma(red);
546             green=DecodePixelGamma(green);
547             blue=DecodePixelGamma(blue);
548           }
549           */
550           intensity=0.298839*red+0.586811*green+0.114350*blue;
551           break;
552         }
553       case Rec709LumaPixelIntensityMethod:
554       default:
555         {
556           /*
557           if (image->colorspace == RGBColorspace)
558           {
559             red=EncodePixelGamma(red);
560             green=EncodePixelGamma(green);
561             blue=EncodePixelGamma(blue);
562           }
563           */
564           intensity=0.212656*red+0.715158*green+0.072186*blue;
565           break;
566         }
567       case Rec709LuminancePixelIntensityMethod:
568         {
569           /*
570           if (image->colorspace == sRGBColorspace)
571           {
572             red=DecodePixelGamma(red);
573             green=DecodePixelGamma(green);
574             blue=DecodePixelGamma(blue);
575           }
576           */
577           intensity=0.212656*red+0.715158*green+0.072186*blue;
578           break;
579         }
580       case RMSPixelIntensityMethod:
581         {
582           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
583               sqrt(3.0));
584           break;
585         }
586     }
587 
588     return intensity;
589   }
590 
591   inline int mirrorBottom(int value)
592   {
593       return (value < 0) ? - (value) : value;
594   }
595 
596   inline int mirrorTop(int value, int width)
597   {
598       return (value >= width) ? (2 * width - value - 1) : value;
599   }
600   )
601 
602 /*
603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
604 %                                                                             %
605 %                                                                             %
606 %                                                                             %
607 %     A d d N o i s e                                                         %
608 %                                                                             %
609 %                                                                             %
610 %                                                                             %
611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
612 */
613 
614   STRINGIFY(
615   /*
616   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
617   This is provided under BSD, full license is with the main package.
618   See http://www.doc.ic.ac.uk/~dt10/research
619   */
620 
621   // Pre: a<M, b<M
622   // Post: r=(a+b) mod M
623   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
624   {
625     ulong v=a+b;
626     //if( (v>=M) || (v<a) )
627     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
628       v=v-M;
629     return v;
630   }
631 
632   // Pre: a<M,b<M
633   // Post: r=(a*b) mod M
634   // This could be done more efficently, but it is portable, and should
635   // be easy to understand. It can be replaced with any of the better
636   // modular multiplication algorithms (for example if you know you have
637   // double precision available or something).
638   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
639   {
640     ulong r=0;
641     while(a!=0){
642       if(a&1)
643         r=MWC_AddMod64(r,b,M);
644       b=MWC_AddMod64(b,b,M);
645       a=a>>1;
646     }
647     return r;
648   }
649 
650   // Pre: a<M, e>=0
651   // Post: r=(a^b) mod M
652   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
653   // most architectures
654   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
655   {
656     ulong sqr=a, acc=1;
657     while(e!=0){
658       if(e&1)
659         acc=MWC_MulMod64(acc,sqr,M);
660         sqr=MWC_MulMod64(sqr,sqr,M);
661       e=e>>1;
662     }
663     return acc;
664   }
665 
666   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
667   {
668     ulong m=MWC_PowMod64(A, distance, M);
669     ulong x=curr.x*(ulong)A+curr.y;
670     x=MWC_MulMod64(x, m, M);
671     return (uint2)((uint)(x/A), (uint)(x%A));
672   }
673 
674   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
675   {
676     // This is an arbitrary constant for starting LCG jumping from. I didn't
677     // want to start from 1, as then you end up with the two or three first values
678     // being a bit poor in ones - once you've decided that, one constant is as
679     // good as any another. There is no deep mathematical reason for it, I just
680     // generated a random number.
681     enum{ MWC_BASEID = 4077358422479273989UL };
682 
683     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
684     ulong m=MWC_PowMod64(A, dist, M);
685 
686     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
687     return (uint2)((uint)(x/A), (uint)(x%A));
688   }
689 
690   //! Represents the state of a particular generator
691   typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
692 
693   void MWC64X_Step(mwc64x_state_t *s)
694   {
695     uint X=s->x, C=s->c;
696 
697     uint Xn=s->seed0*X+C;
698     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
699     uint Cn=mad_hi(s->seed0,X,carry);
700 
701     s->x=Xn;
702     s->c=Cn;
703   }
704 
705   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
706   {
707     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
708     s->x=tmp.x;
709     s->c=tmp.y;
710   }
711 
712   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
713   {
714     uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
715     s->x=tmp.x;
716     s->c=tmp.y;
717   }
718 
719   //! Return a 32-bit integer in the range [0..2^32)
720   uint MWC64X_NextUint(mwc64x_state_t *s)
721   {
722     uint res=s->x ^ s->c;
723     MWC64X_Step(s);
724     return res;
725   }
726 
727   //
728   // End of MWC64X excerpt
729   //
730 
731   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
732   {
733     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
734   }
735 
736   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
737   {
738     float
739       alpha,
740       beta,
741       noise,
742       sigma;
743 
744     noise = 0.0f;
745     alpha=mwcReadPseudoRandomValue(r);
746     switch(noise_type)
747     {
748       case UniformNoise:
749       default:
750         {
751           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
752           break;
753         }
754       case GaussianNoise:
755         {
756           float
757             gamma,
758             tau;
759 
760           if (alpha == 0.0f)
761             alpha=1.0f;
762           beta=mwcReadPseudoRandomValue(r);
763           gamma=sqrt(-2.0f*log(alpha));
764           sigma=gamma*cospi((2.0f*beta));
765           tau=gamma*sinpi((2.0f*beta));
766           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
767           break;
768         }
769       case ImpulseNoise:
770       {
771         if (alpha < (SigmaImpulse/2.0f))
772           noise=0.0f;
773         else
774           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
775             noise=QuantumRange;
776           else
777             noise=pixel;
778         break;
779       }
780       case LaplacianNoise:
781       {
782         if (alpha <= 0.5f)
783           {
784             if (alpha <= MagickEpsilon)
785               noise=(pixel-QuantumRange);
786             else
787               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
788                 0.5f);
789             break;
790           }
791         beta=1.0f-alpha;
792         if (beta <= (0.5f*MagickEpsilon))
793           noise=(pixel+QuantumRange);
794         else
795           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
796         break;
797       }
798       case MultiplicativeGaussianNoise:
799       {
800         sigma=1.0f;
801         if (alpha > MagickEpsilon)
802           sigma=sqrt(-2.0f*log(alpha));
803         beta=mwcReadPseudoRandomValue(r);
804         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
805           cospi((2.0f*beta))/2.0f);
806         break;
807       }
808       case PoissonNoise:
809       {
810         float
811           poisson;
812         unsigned int i;
813         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
814         for (i=0; alpha > poisson; i++)
815         {
816           beta=mwcReadPseudoRandomValue(r);
817           alpha*=beta;
818         }
819         noise=(QuantumRange*i/SigmaPoisson);
820         break;
821       }
822       case RandomNoise:
823       {
824         noise=(QuantumRange*SigmaRandom*alpha);
825         break;
826       }
827     }
828     return noise;
829   }
830 
831   __kernel
832   void AddNoise(const __global CLQuantum *image,
833     const unsigned int number_channels,const ChannelType channel,
834     const unsigned int length,const unsigned int pixelsPerWorkItem,
835     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
836     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
837     __global CLQuantum *filteredImage)
838   {
839     mwc64x_state_t rng;
840     rng.seed0 = seed0;
841     rng.seed1 = seed1;
842 
843     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
844     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
845     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
846 
847     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
848     uint count = pixelsPerWorkItem;
849 
850     while (count > 0 && pos < length)
851     {
852       const __global CLQuantum *p = image + pos;
853       __global CLQuantum *q = filteredImage + pos;
854 
855       float red;
856       float green;
857       float blue;
858       float alpha;
859 
860       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
861 
862       if ((channel & RedChannel) != 0)
863         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
864 
865       if (number_channels > 2)
866       {
867         if ((channel & GreenChannel) != 0)
868           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
869 
870         if ((channel & BlueChannel) != 0)
871           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
872       }
873 
874       if (((number_channels == 4) || (number_channels == 2)) &&
875           ((channel & AlphaChannel) != 0))
876         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
877 
878       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
879 
880       pos += (get_local_size(0) * number_channels);
881       count--;
882     }
883   }
884   )
885 
886 /*
887 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
888 %                                                                             %
889 %                                                                             %
890 %                                                                             %
891 %    B l u r                                                                  %
892 %                                                                             %
893 %                                                                             %
894 %                                                                             %
895 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
896 */
897 
898   STRINGIFY(
899   /*
900   Reduce image noise and reduce detail levels by row
901   */
902   __kernel void BlurRow(const __global CLQuantum *image,
903     const unsigned int number_channels,const ChannelType channel,
904     __constant float *filter,const unsigned int width,
905     const unsigned int imageColumns,const unsigned int imageRows,
906     __local float4 *temp,__global float4 *tempImage)
907   {
908     const int x = get_global_id(0);
909     const int y = get_global_id(1);
910 
911     const int columns = imageColumns;
912 
913     const unsigned int radius = (width-1)/2;
914     const int wsize = get_local_size(0);
915     const unsigned int loadSize = wsize+width;
916 
917     //group coordinate
918     const int groupX=get_local_size(0)*get_group_id(0);
919 
920     //parallel load and clamp
921     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
922     {
923       int cx = ClampToCanvas(i + groupX - radius, columns);
924       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
925     }
926 
927     // barrier
928     barrier(CLK_LOCAL_MEM_FENCE);
929 
930     // only do the work if this is not a patched item
931     if (get_global_id(0) < columns)
932     {
933       // compute
934       float4 result = (float4) 0;
935 
936       int i = 0;
937 
938       for ( ; i+7 < width; )
939       {
940         for (int j=0; j < 8; j++)
941           result+=filter[i+j]*temp[i+j+get_local_id(0)];
942         i+=8;
943       }
944 
945       for ( ; i < width; i++)
946         result+=filter[i]*temp[i+get_local_id(0)];
947 
948       // write back to global
949       tempImage[y*columns+x] = result;
950     }
951   }
952   )
953 
954   STRINGIFY(
955   /*
956   Reduce image noise and reduce detail levels by line
957   */
958   __kernel void BlurColumn(const __global float4 *blurRowData,
959     const unsigned int number_channels,const ChannelType channel,
960     __constant float *filter,const unsigned int width,
961     const unsigned int imageColumns,const unsigned int imageRows,
962     __local float4 *temp,__global CLQuantum *filteredImage)
963   {
964     const int x = get_global_id(0);
965     const int y = get_global_id(1);
966 
967     const int columns = imageColumns;
968     const int rows = imageRows;
969 
970     unsigned int radius = (width-1)/2;
971     const int wsize = get_local_size(1);
972     const unsigned int loadSize = wsize+width;
973 
974     //group coordinate
975     const int groupX=get_local_size(0)*get_group_id(0);
976     const int groupY=get_local_size(1)*get_group_id(1);
977     //notice that get_local_size(0) is 1, so
978     //groupX=get_group_id(0);
979 
980     //parallel load and clamp
981     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
982       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
983 
984     // barrier
985     barrier(CLK_LOCAL_MEM_FENCE);
986 
987     // only do the work if this is not a patched item
988     if (get_global_id(1) < rows)
989     {
990       // compute
991       float4 result = (float4) 0;
992 
993       int i = 0;
994 
995       for ( ; i+7 < width; )
996       {
997         for (int j=0; j < 8; j++)
998           result+=filter[i+j]*temp[i+j+get_local_id(1)];
999         i+=8;
1000       }
1001 
1002       for ( ; i < width; i++)
1003         result+=filter[i]*temp[i+get_local_id(1)];
1004 
1005       // write back to global
1006       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1007     }
1008   }
1009   )
1010 
1011 /*
1012 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1013 %                                                                             %
1014 %                                                                             %
1015 %                                                                             %
1016 %    C o n t r a s t                                                          %
1017 %                                                                             %
1018 %                                                                             %
1019 %                                                                             %
1020 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1021 */
1022 
1023   STRINGIFY(
1024 
1025   inline float4 ConvertRGBToHSB(const float4 pixel)
1026   {
1027     float4 result=0.0f;
1028     result.w=pixel.w;
1029     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1030     if (tmax != 0.0f)
1031     {
1032       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1033       float delta=tmax-tmin;
1034 
1035       result.y=delta/tmax;
1036       result.z=QuantumScale*tmax;
1037       if (delta != 0.0f)
1038       {
1039         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1040         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1041           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1042         result.x/=6.0f;
1043         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1044       }
1045     }
1046     return(result);
1047   }
1048 
1049   inline float4 ConvertHSBToRGB(const float4 pixel)
1050   {
1051     float hue=pixel.x;
1052     float saturation=pixel.y;
1053     float brightness=pixel.z;
1054 
1055     float4 result=pixel;
1056 
1057     if (saturation == 0.0f)
1058     {
1059       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1060     }
1061     else
1062     {
1063       float h=6.0f*(hue-floor(hue));
1064       float f=h-floor(h);
1065       float p=brightness*(1.0f-saturation);
1066       float q=brightness*(1.0f-saturation*f);
1067       float t=brightness*(1.0f-(saturation*(1.0f-f)));
1068       int ih = (int)h;
1069 
1070       if (ih == 1)
1071       {
1072         result.x=ClampToQuantum(QuantumRange*q);
1073         result.y=ClampToQuantum(QuantumRange*brightness);
1074         result.z=ClampToQuantum(QuantumRange*p);
1075       }
1076       else if (ih == 2)
1077       {
1078         result.x=ClampToQuantum(QuantumRange*p);
1079         result.y=ClampToQuantum(QuantumRange*brightness);
1080         result.z=ClampToQuantum(QuantumRange*t);
1081       }
1082       else if (ih == 3)
1083       {
1084         result.x=ClampToQuantum(QuantumRange*p);
1085         result.y=ClampToQuantum(QuantumRange*q);
1086         result.z=ClampToQuantum(QuantumRange*brightness);
1087       }
1088       else if (ih == 4)
1089       {
1090         result.x=ClampToQuantum(QuantumRange*t);
1091         result.y=ClampToQuantum(QuantumRange*p);
1092         result.z=ClampToQuantum(QuantumRange*brightness);
1093       }
1094       else if (ih == 5)
1095       {
1096         result.x=ClampToQuantum(QuantumRange*brightness);
1097         result.y=ClampToQuantum(QuantumRange*p);
1098         result.z=ClampToQuantum(QuantumRange*q);
1099       }
1100       else
1101       {
1102         result.x=ClampToQuantum(QuantumRange*brightness);
1103         result.y=ClampToQuantum(QuantumRange*t);
1104         result.z=ClampToQuantum(QuantumRange*p);
1105       }
1106     }
1107     return(result);
1108   }
1109 
1110   __kernel void Contrast(__global CLQuantum *image,
1111     const unsigned int number_channels,const int sign)
1112   {
1113     const int x=get_global_id(0);
1114     const int y=get_global_id(1);
1115     const unsigned int columns=get_global_size(0);
1116 
1117     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1118     if (number_channels < 3)
1119       pixel.y=pixel.z=pixel.x;
1120 
1121     pixel=ConvertRGBToHSB(pixel);
1122     float brightness=pixel.z;
1123     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1124     brightness=clamp(brightness,0.0f,1.0f);
1125     pixel.z=brightness;
1126     pixel=ConvertHSBToRGB(pixel);
1127 
1128     WriteAllChannels(image,number_channels,columns,x,y,pixel);
1129   }
1130   )
1131 
1132 /*
1133 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1134 %                                                                             %
1135 %                                                                             %
1136 %                                                                             %
1137 %    C o n t r a s t S t r e t c h                                            %
1138 %                                                                             %
1139 %                                                                             %
1140 %                                                                             %
1141 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1142 */
1143 
1144     STRINGIFY(
1145     /*
1146     */
1147     __kernel void Histogram(__global CLPixelType * restrict im,
1148       const ChannelType channel,
1149       const unsigned int colorspace,
1150       const unsigned int method,
1151       __global uint4 * restrict histogram)
1152       {
1153         const int x = get_global_id(0);
1154         const int y = get_global_id(1);
1155         const int columns = get_global_size(0);
1156         const int c = x + y * columns;
1157         if ((channel & SyncChannels) != 0)
1158         {
1159           float red=(float)getRed(im[c]);
1160           float green=(float)getGreen(im[c]);
1161           float blue=(float)getBlue(im[c]);
1162 
1163           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1164           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1165           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1166         }
1167         else
1168         {
1169           // for equalizing, we always need all channels?
1170           // otherwise something more
1171         }
1172       }
1173     )
1174 
1175     STRINGIFY(
1176     /*
1177     */
1178     __kernel void ContrastStretch(__global CLPixelType * restrict im,
1179       const ChannelType channel,
1180       __global CLPixelType * restrict stretch_map,
1181       const float4 white, const float4 black)
1182       {
1183         const int x = get_global_id(0);
1184         const int y = get_global_id(1);
1185         const int columns = get_global_size(0);
1186         const int c = x + y * columns;
1187 
1188         uint ePos;
1189         CLPixelType oValue, eValue;
1190         CLQuantum red, green, blue, alpha;
1191 
1192         //read from global
1193         oValue=im[c];
1194 
1195         if ((channel & RedChannel) != 0)
1196         {
1197           if (getRedF4(white) != getRedF4(black))
1198           {
1199             ePos = ScaleQuantumToMap(getRed(oValue));
1200             eValue = stretch_map[ePos];
1201             red = getRed(eValue);
1202           }
1203         }
1204 
1205         if ((channel & GreenChannel) != 0)
1206         {
1207           if (getGreenF4(white) != getGreenF4(black))
1208           {
1209             ePos = ScaleQuantumToMap(getGreen(oValue));
1210             eValue = stretch_map[ePos];
1211             green = getGreen(eValue);
1212           }
1213         }
1214 
1215         if ((channel & BlueChannel) != 0)
1216         {
1217           if (getBlueF4(white) != getBlueF4(black))
1218           {
1219             ePos = ScaleQuantumToMap(getBlue(oValue));
1220             eValue = stretch_map[ePos];
1221             blue = getBlue(eValue);
1222           }
1223         }
1224 
1225         if ((channel & AlphaChannel) != 0)
1226         {
1227           if (getAlphaF4(white) != getAlphaF4(black))
1228           {
1229             ePos = ScaleQuantumToMap(getAlpha(oValue));
1230             eValue = stretch_map[ePos];
1231             alpha = getAlpha(eValue);
1232           }
1233         }
1234 
1235         //write back
1236         im[c]=(CLPixelType)(blue, green, red, alpha);
1237 
1238       }
1239     )
1240 
1241 /*
1242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1243 %                                                                             %
1244 %                                                                             %
1245 %                                                                             %
1246 %    C o n v o l v e                                                          %
1247 %                                                                             %
1248 %                                                                             %
1249 %                                                                             %
1250 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1251 */
1252 
1253   STRINGIFY(
1254     __kernel
1255     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1256     const unsigned int imageWidth, const unsigned int imageHeight,
1257     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1258     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1259 
1260       int2 blockID;
1261       blockID.x = get_global_id(0) / get_local_size(0);
1262       blockID.y = get_global_id(1) / get_local_size(1);
1263 
1264       // image area processed by this workgroup
1265       int2 imageAreaOrg;
1266       imageAreaOrg.x = blockID.x * get_local_size(0);
1267       imageAreaOrg.y = blockID.y * get_local_size(1);
1268 
1269       int2 midFilterDimen;
1270       midFilterDimen.x = (filterWidth-1)/2;
1271       midFilterDimen.y = (filterHeight-1)/2;
1272 
1273       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1274 
1275       // dimension of the local cache
1276       int2 cachedAreaDimen;
1277       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1278       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1279 
1280       // cache the pixels accessed by this workgroup in local memory
1281       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1282       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1283       int groupSize = get_local_size(0) * get_local_size(1);
1284       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1285 
1286         int2 cachedAreaIndex;
1287         cachedAreaIndex.x = i % cachedAreaDimen.x;
1288         cachedAreaIndex.y = i / cachedAreaDimen.x;
1289 
1290         int2 imagePixelIndex;
1291         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1292 
1293         // only support EdgeVirtualPixelMethod through ClampToCanvas
1294         // TODO: implement other virtual pixel method
1295         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1296         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1297 
1298         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1299       }
1300 
1301       // cache the filter
1302       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1303         filterCache[i] = filter[i];
1304       }
1305       barrier(CLK_LOCAL_MEM_FENCE);
1306 
1307 
1308       int2 imageIndex;
1309       imageIndex.x = imageAreaOrg.x + get_local_id(0);
1310       imageIndex.y = imageAreaOrg.y + get_local_id(1);
1311 
1312       // if out-of-range, stops here and quit
1313       if (imageIndex.x >= imageWidth
1314         || imageIndex.y >= imageHeight) {
1315           return;
1316       }
1317 
1318       int filterIndex = 0;
1319       float4 sum = (float4)0.0f;
1320       float gamma = 0.0f;
1321       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1322         int cacheIndexY = get_local_id(1);
1323         for (int j = 0; j < filterHeight; j++) {
1324           int cacheIndexX = get_local_id(0);
1325           for (int i = 0; i < filterWidth; i++) {
1326             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1327             float f = filterCache[filterIndex];
1328 
1329             sum.x += f * p.x;
1330             sum.y += f * p.y;
1331             sum.z += f * p.z;
1332             sum.w += f * p.w;
1333 
1334             gamma += f;
1335             filterIndex++;
1336             cacheIndexX++;
1337           }
1338           cacheIndexY++;
1339         }
1340       }
1341       else {
1342         int cacheIndexY = get_local_id(1);
1343         for (int j = 0; j < filterHeight; j++) {
1344           int cacheIndexX = get_local_id(0);
1345           for (int i = 0; i < filterWidth; i++) {
1346 
1347             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1348             float alpha = QuantumScale*p.w;
1349             float f = filterCache[filterIndex];
1350             float g = alpha * f;
1351 
1352             sum.x += g*p.x;
1353             sum.y += g*p.y;
1354             sum.z += g*p.z;
1355             sum.w += f*p.w;
1356 
1357             gamma += g;
1358             filterIndex++;
1359             cacheIndexX++;
1360           }
1361           cacheIndexY++;
1362         }
1363         gamma = PerceptibleReciprocal(gamma);
1364         sum.xyz = gamma*sum.xyz;
1365       }
1366       CLPixelType outputPixel;
1367       outputPixel.x = ClampToQuantum(sum.x);
1368       outputPixel.y = ClampToQuantum(sum.y);
1369       outputPixel.z = ClampToQuantum(sum.z);
1370       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1371 
1372       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1373     }
1374   )
1375 
1376   STRINGIFY(
1377     __kernel
1378     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1379                   const uint imageWidth, const uint imageHeight,
1380                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1381                   const uint matte, const ChannelType channel) {
1382 
1383       int2 imageIndex;
1384       imageIndex.x = get_global_id(0);
1385       imageIndex.y = get_global_id(1);
1386 
1387       /*
1388       unsigned int imageWidth = get_global_size(0);
1389       unsigned int imageHeight = get_global_size(1);
1390       */
1391       if (imageIndex.x >= imageWidth
1392           || imageIndex.y >= imageHeight)
1393           return;
1394 
1395       int2 midFilterDimen;
1396       midFilterDimen.x = (filterWidth-1)/2;
1397       midFilterDimen.y = (filterHeight-1)/2;
1398 
1399       int filterIndex = 0;
1400       float4 sum = (float4)0.0f;
1401       float gamma = 0.0f;
1402       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1403         for (int j = 0; j < filterHeight; j++) {
1404           int2 inputPixelIndex;
1405           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1406           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1407           for (int i = 0; i < filterWidth; i++) {
1408             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1409             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1410 
1411             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1412             float f = filter[filterIndex];
1413 
1414             sum.x += f * p.x;
1415             sum.y += f * p.y;
1416             sum.z += f * p.z;
1417             sum.w += f * p.w;
1418 
1419             gamma += f;
1420 
1421             filterIndex++;
1422           }
1423         }
1424       }
1425       else {
1426 
1427         for (int j = 0; j < filterHeight; j++) {
1428           int2 inputPixelIndex;
1429           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1430           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1431           for (int i = 0; i < filterWidth; i++) {
1432             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1433             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1434 
1435             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1436             float alpha = QuantumScale*p.w;
1437             float f = filter[filterIndex];
1438             float g = alpha * f;
1439 
1440             sum.x += g*p.x;
1441             sum.y += g*p.y;
1442             sum.z += g*p.z;
1443             sum.w += f*p.w;
1444 
1445             gamma += g;
1446 
1447 
1448             filterIndex++;
1449           }
1450         }
1451         gamma = PerceptibleReciprocal(gamma);
1452         sum.xyz = gamma*sum.xyz;
1453       }
1454 
1455       CLPixelType outputPixel;
1456       outputPixel.x = ClampToQuantum(sum.x);
1457       outputPixel.y = ClampToQuantum(sum.y);
1458       outputPixel.z = ClampToQuantum(sum.z);
1459       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1460 
1461       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1462     }
1463   )
1464 
1465 /*
1466 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1467 %                                                                             %
1468 %                                                                             %
1469 %                                                                             %
1470 %     D e s p e c k l e                                                       %
1471 %                                                                             %
1472 %                                                                             %
1473 %                                                                             %
1474 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1475 */
1476 
1477   STRINGIFY(
1478 
1479   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1480   , const unsigned int imageWidth, const unsigned int imageHeight
1481   , const int2 offset, const int polarity, const int matte) {
1482 
1483     int x = get_global_id(0);
1484     int y = get_global_id(1);
1485 
1486     CLPixelType v = inputImage[y*imageWidth+x];
1487 
1488     int2 neighbor;
1489     neighbor.y = y + offset.y;
1490     neighbor.x = x + offset.x;
1491 
1492     int2 clampedNeighbor;
1493     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1494     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1495 
1496     CLPixelType r = (clampedNeighbor.x == neighbor.x
1497                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1498     :(CLPixelType)0;
1499 
1500     int sv[4];
1501     sv[0] = (int)v.x;
1502     sv[1] = (int)v.y;
1503     sv[2] = (int)v.z;
1504     sv[3] = (int)v.w;
1505 
1506     int sr[4];
1507     sr[0] = (int)r.x;
1508     sr[1] = (int)r.y;
1509     sr[2] = (int)r.z;
1510     sr[3] = (int)r.w;
1511 
1512     if (polarity > 0) {
1513       \n #pragma unroll 4\n
1514       for (unsigned int i = 0; i < 4; i++) {
1515         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1516       }
1517     }
1518     else {
1519       \n #pragma unroll 4\n
1520       for (unsigned int i = 0; i < 4; i++) {
1521         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1522       }
1523 
1524     }
1525 
1526     v.x = (CLQuantum)sv[0];
1527     v.y = (CLQuantum)sv[1];
1528     v.z = (CLQuantum)sv[2];
1529 
1530     if (matte!=0)
1531       v.w = (CLQuantum)sv[3];
1532 
1533     outputImage[y*imageWidth+x] = v;
1534 
1535     }
1536 
1537 
1538   )
1539 
1540   STRINGIFY(
1541 
1542   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1543   , const unsigned int imageWidth, const unsigned int imageHeight
1544   , const int2 offset, const int polarity, const int matte) {
1545 
1546     int x = get_global_id(0);
1547     int y = get_global_id(1);
1548 
1549     CLPixelType v = inputImage[y*imageWidth+x];
1550 
1551     int2 neighbor, clampedNeighbor;
1552 
1553     neighbor.y = y + offset.y;
1554     neighbor.x = x + offset.x;
1555     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1556     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1557 
1558     CLPixelType r = (clampedNeighbor.x == neighbor.x
1559       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1560     :(CLPixelType)0;
1561 
1562 
1563     neighbor.y = y - offset.y;
1564     neighbor.x = x - offset.x;
1565     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1566     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1567 
1568     CLPixelType s = (clampedNeighbor.x == neighbor.x
1569       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1570     :(CLPixelType)0;
1571 
1572 
1573     int sv[4];
1574     sv[0] = (int)v.x;
1575     sv[1] = (int)v.y;
1576     sv[2] = (int)v.z;
1577     sv[3] = (int)v.w;
1578 
1579     int sr[4];
1580     sr[0] = (int)r.x;
1581     sr[1] = (int)r.y;
1582     sr[2] = (int)r.z;
1583     sr[3] = (int)r.w;
1584 
1585     int ss[4];
1586     ss[0] = (int)s.x;
1587     ss[1] = (int)s.y;
1588     ss[2] = (int)s.z;
1589     ss[3] = (int)s.w;
1590 
1591     if (polarity > 0) {
1592       \n #pragma unroll 4\n
1593       for (unsigned int i = 0; i < 4; i++) {
1594         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1595         //
1596         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1597         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1598         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1599       }
1600     }
1601     else {
1602       \n #pragma unroll 4\n
1603       for (unsigned int i = 0; i < 4; i++) {
1604         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1605         //
1606         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1607         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1608       }
1609     }
1610 
1611     v.x = (CLQuantum)sv[0];
1612     v.y = (CLQuantum)sv[1];
1613     v.z = (CLQuantum)sv[2];
1614 
1615     if (matte!=0)
1616       v.w = (CLQuantum)sv[3];
1617 
1618     outputImage[y*imageWidth+x] = v;
1619 
1620     }
1621   )
1622 
1623 /*
1624 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1625 %                                                                             %
1626 %                                                                             %
1627 %                                                                             %
1628 %     E q u a l i z e                                                         %
1629 %                                                                             %
1630 %                                                                             %
1631 %                                                                             %
1632 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1633 */
1634 
1635     STRINGIFY(
1636     /*
1637     */
1638     __kernel void Equalize(__global CLPixelType * restrict im,
1639       const ChannelType channel,
1640       __global CLPixelType * restrict equalize_map,
1641       const float4 white, const float4 black)
1642       {
1643         const int x = get_global_id(0);
1644         const int y = get_global_id(1);
1645         const int columns = get_global_size(0);
1646         const int c = x + y * columns;
1647 
1648         uint ePos;
1649         CLPixelType oValue, eValue;
1650         CLQuantum red, green, blue, alpha;
1651 
1652         //read from global
1653         oValue=im[c];
1654 
1655         if ((channel & SyncChannels) != 0)
1656         {
1657           if (getRedF4(white) != getRedF4(black))
1658           {
1659             ePos = ScaleQuantumToMap(getRed(oValue));
1660             eValue = equalize_map[ePos];
1661             red = getRed(eValue);
1662             ePos = ScaleQuantumToMap(getGreen(oValue));
1663             eValue = equalize_map[ePos];
1664             green = getRed(eValue);
1665             ePos = ScaleQuantumToMap(getBlue(oValue));
1666             eValue = equalize_map[ePos];
1667             blue = getRed(eValue);
1668             ePos = ScaleQuantumToMap(getAlpha(oValue));
1669             eValue = equalize_map[ePos];
1670             alpha = getRed(eValue);
1671 
1672             //write back
1673             im[c]=(CLPixelType)(blue, green, red, alpha);
1674           }
1675 
1676         }
1677 
1678         // for equalizing, we always need all channels?
1679         // otherwise something more
1680 
1681      }
1682     )
1683 
1684 /*
1685 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1686 %                                                                             %
1687 %                                                                             %
1688 %                                                                             %
1689 %     F u n c t i o n                                                         %
1690 %                                                                             %
1691 %                                                                             %
1692 %                                                                             %
1693 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1694 */
1695 
1696   STRINGIFY(
1697   /*
1698   apply FunctionImageChannel(braightness-contrast)
1699   */
1700   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1701     const unsigned int number_parameters,__constant float *parameters)
1702   {
1703     float result = 0.0f;
1704 
1705     switch (function)
1706     {
1707     case PolynomialFunction:
1708       {
1709         for (unsigned int i=0; i < number_parameters; i++)
1710           result = result*QuantumScale*pixel + parameters[i];
1711         result *= QuantumRange;
1712         break;
1713       }
1714     case SinusoidFunction:
1715       {
1716         float  freq,phase,ampl,bias;
1717         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1718         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1719         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1720         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1721         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1722           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1723         break;
1724       }
1725     case ArcsinFunction:
1726       {
1727         float  width,range,center,bias;
1728         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1729         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1730         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1731         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1732 
1733         result = 2.0f/width*(QuantumScale*pixel - center);
1734         result = range/MagickPI*asin(result)+bias;
1735         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1736         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1737         result *= QuantumRange;
1738         break;
1739       }
1740     case ArctanFunction:
1741       {
1742         float slope,range,center,bias;
1743         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1744         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1745         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1746         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1747         result = MagickPI*slope*(QuantumScale*pixel-center);
1748         result = QuantumRange*(range/MagickPI*atan(result) + bias);
1749         break;
1750       }
1751     case UndefinedFunction:
1752       break;
1753     }
1754     return(ClampToQuantum(result));
1755   }
1756   )
1757 
1758   STRINGIFY(
1759   /*
1760   Improve brightness / contrast of the image
1761   channel : define which channel is improved
1762   function : the function called to enchance the brightness contrast
1763   number_parameters : numbers of parameters
1764   parameters : the parameter
1765   */
1766   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1767     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1768     __constant float *parameters)
1769   {
1770     const unsigned int x = get_global_id(0);
1771     const unsigned int y = get_global_id(1);
1772     const unsigned int columns = get_global_size(0);
1773     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1774 
1775     float red;
1776     float green;
1777     float blue;
1778     float alpha;
1779 
1780     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1781 
1782     if ((channel & RedChannel) != 0)
1783       red=ApplyFunction(red, function, number_parameters, parameters);
1784 
1785     if (number_channels > 2)
1786       {
1787         if ((channel & GreenChannel) != 0)
1788           green=ApplyFunction(green, function, number_parameters, parameters);
1789 
1790         if ((channel & BlueChannel) != 0)
1791           blue=ApplyFunction(blue, function, number_parameters, parameters);
1792       }
1793 
1794     if (((number_channels == 4) || (number_channels == 2)) &&
1795         ((channel & AlphaChannel) != 0))
1796       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1797 
1798     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1799   }
1800   )
1801 
1802 /*
1803 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1804 %                                                                             %
1805 %                                                                             %
1806 %                                                                             %
1807 %     G r a y s c a l e                                                       %
1808 %                                                                             %
1809 %                                                                             %
1810 %                                                                             %
1811 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1812 */
1813 
1814   STRINGIFY(
1815   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1816     const unsigned int colorspace,const unsigned int method)
1817   {
1818     const unsigned int x = get_global_id(0);
1819     const unsigned int y = get_global_id(1);
1820     const unsigned int columns = get_global_size(0);
1821     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1822 
1823     float
1824       blue,
1825       green,
1826       red;
1827 
1828     red=getPixelRed(p);
1829     green=getPixelGreen(p);
1830     blue=getPixelBlue(p);
1831 
1832     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1833 
1834     setPixelRed(p,intensity);
1835     setPixelGreen(p,intensity);
1836     setPixelBlue(p,intensity);
1837   }
1838   )
1839 
1840 /*
1841 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1842 %                                                                             %
1843 %                                                                             %
1844 %                                                                             %
1845 %     L o c a l C o n t r a s t                                               %
1846 %                                                                             %
1847 %                                                                             %
1848 %                                                                             %
1849 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1850 */
1851 
1852     STRINGIFY(
1853 
1854       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1855           const int radius,
1856           const int imageWidth,
1857           const int imageHeight)
1858       {
1859         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1860 
1861         int x = get_local_id(0);
1862         int y = get_global_id(1);
1863 
1864         if ((x >= imageWidth) || (y >= imageHeight))
1865           return;
1866 
1867         global CLPixelType *src = srcImage + y * imageWidth;
1868 
1869         for (int i = x; i < imageWidth; i += get_local_size(0)) {
1870             float sum = 0.0f;
1871             float weight = 1.0f;
1872 
1873             int j = i - radius;
1874             while ((j + 7) < i) {
1875                 for (int k = 0; k < 8; ++k) // Unroll 8x
1876                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1877                 weight += 8.0f;
1878                 j+=8;
1879             }
1880             while (j < i) {
1881                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1882                 weight += 1.0f;
1883                 ++j;
1884             }
1885 
1886             while ((j + 7) < radius + i) {
1887                 for (int k = 0; k < 8; ++k) // Unroll 8x
1888                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1889                 weight -= 8.0f;
1890                 j+=8;
1891             }
1892             while (j < radius + i) {
1893                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1894                 weight -= 1.0f;
1895                 ++j;
1896             }
1897 
1898             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1899         }
1900       }
1901     )
1902 
1903     STRINGIFY(
1904       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1905           const int radius,
1906           const float strength,
1907           const int imageWidth,
1908           const int imageHeight)
1909       {
1910         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1911 
1912         int x = get_global_id(0);
1913         int y = get_global_id(1);
1914 
1915         if ((x >= imageWidth) || (y >= imageHeight))
1916                 return;
1917 
1918         global float *src = blurImage + x;
1919 
1920         float sum = 0.0f;
1921         float weight = 1.0f;
1922 
1923         int j = y - radius;
1924         while ((j + 7) < y) {
1925             for (int k = 0; k < 8; ++k) // Unroll 8x
1926                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1927             weight += 8.0f;
1928             j+=8;
1929         }
1930         while (j < y) {
1931             sum += weight * src[mirrorBottom(j) * imageWidth];
1932             weight += 1.0f;
1933             ++j;
1934         }
1935 
1936         while ((j + 7) < radius + y) {
1937             for (int k = 0; k < 8; ++k) // Unroll 8x
1938                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1939             weight -= 8.0f;
1940             j+=8;
1941         }
1942         while (j < radius + y) {
1943             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1944             weight -= 1.0f;
1945             ++j;
1946         }
1947 
1948         CLPixelType pixel = srcImage[x + y * imageWidth];
1949         float srcVal = dot(RGB, convert_float4(pixel));
1950         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1951         mult = (srcVal + mult) / srcVal;
1952 
1953         pixel.x = ClampToQuantum(pixel.x * mult);
1954         pixel.y = ClampToQuantum(pixel.y * mult);
1955         pixel.z = ClampToQuantum(pixel.z * mult);
1956 
1957         dstImage[x + y * imageWidth] = pixel;
1958       }
1959     )
1960 
1961 /*
1962 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1963 %                                                                             %
1964 %                                                                             %
1965 %                                                                             %
1966 %     M o d u l a t e                                                         %
1967 %                                                                             %
1968 %                                                                             %
1969 %                                                                             %
1970 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1971 */
1972 
1973   STRINGIFY(
1974 
1975   inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1976     float *hue, float *saturation, float *lightness)
1977   {
1978   float
1979     c,
1980     tmax,
1981     tmin;
1982 
1983   /*
1984      Convert RGB to HSL colorspace.
1985      */
1986   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1987   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1988 
1989   c=tmax-tmin;
1990 
1991   *lightness=(tmax+tmin)/2.0;
1992   if (c <= 0.0)
1993   {
1994     *hue=0.0;
1995     *saturation=0.0;
1996     return;
1997   }
1998 
1999   if (tmax == (QuantumScale*red))
2000   {
2001     *hue=(QuantumScale*green-QuantumScale*blue)/c;
2002     if ((QuantumScale*green) < (QuantumScale*blue))
2003       *hue+=6.0;
2004   }
2005   else
2006     if (tmax == (QuantumScale*green))
2007       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2008     else
2009       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2010 
2011   *hue*=60.0/360.0;
2012   if (*lightness <= 0.5)
2013     *saturation=c/(2.0*(*lightness));
2014   else
2015     *saturation=c/(2.0-2.0*(*lightness));
2016   }
2017 
2018   inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2019       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2020   {
2021     float
2022       b,
2023       c,
2024       g,
2025       h,
2026       tmin,
2027       r,
2028       x;
2029 
2030     /*
2031        Convert HSL to RGB colorspace.
2032        */
2033     h=hue*360.0;
2034     if (lightness <= 0.5)
2035       c=2.0*lightness*saturation;
2036     else
2037       c=(2.0-2.0*lightness)*saturation;
2038     tmin=lightness-0.5*c;
2039     h-=360.0*floor(h/360.0);
2040     h/=60.0;
2041     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2042     switch ((int) floor(h) % 6)
2043     {
2044       case 0:
2045         {
2046           r=tmin+c;
2047           g=tmin+x;
2048           b=tmin;
2049           break;
2050         }
2051       case 1:
2052         {
2053           r=tmin+x;
2054           g=tmin+c;
2055           b=tmin;
2056           break;
2057         }
2058       case 2:
2059         {
2060           r=tmin;
2061           g=tmin+c;
2062           b=tmin+x;
2063           break;
2064         }
2065       case 3:
2066         {
2067           r=tmin;
2068           g=tmin+x;
2069           b=tmin+c;
2070           break;
2071         }
2072       case 4:
2073         {
2074           r=tmin+x;
2075           g=tmin;
2076           b=tmin+c;
2077           break;
2078         }
2079       case 5:
2080         {
2081           r=tmin+c;
2082           g=tmin;
2083           b=tmin+x;
2084           break;
2085         }
2086       default:
2087         {
2088           r=0.0;
2089           g=0.0;
2090           b=0.0;
2091         }
2092     }
2093     *red=ClampToQuantum(QuantumRange*r);
2094     *green=ClampToQuantum(QuantumRange*g);
2095     *blue=ClampToQuantum(QuantumRange*b);
2096   }
2097 
2098   inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2099     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2100   {
2101     float
2102       hue,
2103       lightness,
2104       saturation;
2105 
2106     /*
2107     Increase or decrease color lightness, saturation, or hue.
2108     */
2109     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2110     hue+=0.5*(0.01*percent_hue-1.0);
2111     while (hue < 0.0)
2112       hue+=1.0;
2113     while (hue >= 1.0)
2114       hue-=1.0;
2115     saturation*=0.01*percent_saturation;
2116     lightness*=0.01*percent_lightness;
2117     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2118   }
2119 
2120   __kernel void Modulate(__global CLPixelType *im,
2121     const float percent_brightness,
2122     const float percent_hue,
2123     const float percent_saturation,
2124     const int colorspace)
2125   {
2126 
2127     const int x = get_global_id(0);
2128     const int y = get_global_id(1);
2129     const int columns = get_global_size(0);
2130     const int c = x + y * columns;
2131 
2132     CLPixelType pixel = im[c];
2133 
2134     CLQuantum
2135         blue,
2136         green,
2137         red;
2138 
2139     red=getRed(pixel);
2140     green=getGreen(pixel);
2141     blue=getBlue(pixel);
2142 
2143     switch (colorspace)
2144     {
2145       case HSLColorspace:
2146       default:
2147         {
2148           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2149               &red, &green, &blue);
2150         }
2151 
2152     }
2153 
2154     CLPixelType filteredPixel;
2155 
2156     setRed(&filteredPixel, red);
2157     setGreen(&filteredPixel, green);
2158     setBlue(&filteredPixel, blue);
2159     filteredPixel.w = pixel.w;
2160 
2161     im[c] = filteredPixel;
2162   }
2163   )
2164 
2165 /*
2166 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2167 %                                                                             %
2168 %                                                                             %
2169 %                                                                             %
2170 %     M o t i o n B l u r                                                     %
2171 %                                                                             %
2172 %                                                                             %
2173 %                                                                             %
2174 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2175 */
2176 
2177   STRINGIFY(
2178     __kernel
2179     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2180                     const unsigned int imageWidth, const unsigned int imageHeight,
2181                     const __global float *filter, const unsigned int width, const __global int2* offset,
2182                     const float4 bias,
2183                     const ChannelType channel, const unsigned int matte) {
2184 
2185       int2 currentPixel;
2186       currentPixel.x = get_global_id(0);
2187       currentPixel.y = get_global_id(1);
2188 
2189       if (currentPixel.x >= imageWidth
2190           || currentPixel.y >= imageHeight)
2191           return;
2192 
2193       float4 pixel;
2194       pixel.x = (float)bias.x;
2195       pixel.y = (float)bias.y;
2196       pixel.z = (float)bias.z;
2197       pixel.w = (float)bias.w;
2198 
2199       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2200 
2201         for (int i = 0; i < width; i++) {
2202           // only support EdgeVirtualPixelMethod through ClampToCanvas
2203           // TODO: implement other virtual pixel method
2204           int2 samplePixel = currentPixel + offset[i];
2205           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2206           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2207           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2208 
2209           pixel.x += (filter[i] * (float)samplePixelValue.x);
2210           pixel.y += (filter[i] * (float)samplePixelValue.y);
2211           pixel.z += (filter[i] * (float)samplePixelValue.z);
2212           pixel.w += (filter[i] * (float)samplePixelValue.w);
2213         }
2214 
2215         CLPixelType outputPixel;
2216         outputPixel.x = ClampToQuantum(pixel.x);
2217         outputPixel.y = ClampToQuantum(pixel.y);
2218         outputPixel.z = ClampToQuantum(pixel.z);
2219         outputPixel.w = ClampToQuantum(pixel.w);
2220         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2221       }
2222       else {
2223 
2224         float gamma = 0.0f;
2225         for (int i = 0; i < width; i++) {
2226           // only support EdgeVirtualPixelMethod through ClampToCanvas
2227           // TODO: implement other virtual pixel method
2228           int2 samplePixel = currentPixel + offset[i];
2229           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2230           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2231 
2232           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2233 
2234           float alpha = QuantumScale*samplePixelValue.w;
2235           float k = filter[i];
2236           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2237           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2238           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2239 
2240           pixel.w += k * alpha * samplePixelValue.w;
2241 
2242           gamma+=k*alpha;
2243         }
2244         gamma = PerceptibleReciprocal(gamma);
2245         pixel.xyz = gamma*pixel.xyz;
2246 
2247         CLPixelType outputPixel;
2248         outputPixel.x = ClampToQuantum(pixel.x);
2249         outputPixel.y = ClampToQuantum(pixel.y);
2250         outputPixel.z = ClampToQuantum(pixel.z);
2251         outputPixel.w = ClampToQuantum(pixel.w);
2252         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2253       }
2254     }
2255   )
2256 
2257 /*
2258 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2259 %                                                                             %
2260 %                                                                             %
2261 %                                                                             %
2262 %     R e s i z e                                                             %
2263 %                                                                             %
2264 %                                                                             %
2265 %                                                                             %
2266 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2267 */
2268 
2269   STRINGIFY(
2270   // Based on Box from resize.c
2271   float BoxResizeFilter(const float x)
2272   {
2273     return 1.0f;
2274   }
2275   )
2276 
2277   STRINGIFY(
2278   // Based on CubicBC from resize.c
2279   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2280   {
2281     /*
2282     Cubic Filters using B,C determined values:
2283     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2284     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2285     Spline              B = 1   C = 0    B-Spline Gaussian approximation
2286     Hermite             B = 0   C = 0    B-Spline interpolator
2287 
2288     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2289     Graphics Computer Graphics, Volume 22, Number 4, August 1988
2290     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2291     Mitchell.pdf.
2292 
2293     Coefficents are determined from B,C values:
2294     P0 = (  6 - 2*B       )/6 = coeff[0]
2295     P1 =         0
2296     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2297     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2298     Q0 = (      8*B +24*C )/6 = coeff[3]
2299     Q1 = (    -12*B -48*C )/6 = coeff[4]
2300     Q2 = (      6*B +30*C )/6 = coeff[5]
2301     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2302 
2303     which are used to define the filter:
2304 
2305     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2306     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2307 
2308     which ensures function is continuous in value and derivative (slope).
2309     */
2310     if (x < 1.0)
2311       return(resizeFilterCoefficients[0]+x*(x*
2312       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2313     if (x < 2.0)
2314       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2315       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2316     return(0.0);
2317   }
2318   )
2319 
2320   STRINGIFY(
2321   float Sinc(const float x)
2322   {
2323     if (x != 0.0f)
2324     {
2325       const float alpha=(float) (MagickPI*x);
2326       return sinpi(x)/alpha;
2327     }
2328     return(1.0f);
2329   }
2330   )
2331 
2332   STRINGIFY(
2333   float Triangle(const float x)
2334   {
2335     /*
2336     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2337     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2338     for Sinc().
2339     */
2340     return ((x<1.0f)?(1.0f-x):0.0f);
2341   }
2342   )
2343 
2344 
2345   STRINGIFY(
2346   float Hann(const float x)
2347   {
2348     /*
2349     Cosine window function:
2350       0.5+0.5*cos(pi*x).
2351     */
2352     const float cosine=cos((MagickPI*x));
2353     return(0.5f+0.5f*cosine);
2354   }
2355   )
2356 
2357   STRINGIFY(
2358   float Hamming(const float x)
2359   {
2360     /*
2361       Offset cosine window function:
2362        .54 + .46 cos(pi x).
2363     */
2364     const float cosine=cos((MagickPI*x));
2365     return(0.54f+0.46f*cosine);
2366   }
2367   )
2368 
2369   STRINGIFY(
2370   float Blackman(const float x)
2371   {
2372     /*
2373       Blackman: 2nd order cosine windowing function:
2374         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2375 
2376       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2377       five flops.
2378     */
2379     const float cosine=cos((MagickPI*x));
2380     return(0.34f+cosine*(0.5f+cosine*0.16f));
2381   }
2382   )
2383 
2384   STRINGIFY(
2385   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2386   {
2387     switch (filterType)
2388     {
2389     /* Call Sinc even for SincFast to get better precision on GPU
2390        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2391     case SincWeightingFunction:
2392     case SincFastWeightingFunction:
2393       return Sinc(x);
2394     case CubicBCWeightingFunction:
2395       return CubicBC(x,filterCoefficients);
2396     case BoxWeightingFunction:
2397       return BoxResizeFilter(x);
2398     case TriangleWeightingFunction:
2399       return Triangle(x);
2400     case HannWeightingFunction:
2401       return Hann(x);
2402     case HammingWeightingFunction:
2403       return Hamming(x);
2404     case BlackmanWeightingFunction:
2405       return Blackman(x);
2406 
2407     default:
2408       return 0.0f;
2409     }
2410   }
2411   )
2412 
2413 
2414   STRINGIFY(
2415   inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2416            , const ResizeWeightingFunctionType resizeWindowType
2417            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2418   {
2419     float scale;
2420     float xBlur = fabs(x/resizeFilterBlur);
2421     if (resizeWindowSupport < MagickEpsilon
2422         || resizeWindowType == BoxWeightingFunction)
2423     {
2424       scale = 1.0f;
2425     }
2426     else
2427     {
2428       scale = resizeFilterScale;
2429       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2430     }
2431     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2432     return weight;
2433   }
2434 
2435   )
2436 
2437   ;
2438   const char *accelerateKernels2 =
2439 
2440   STRINGIFY(
2441 
2442   inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2443     return (numWorkItems/pixelPerWorkgroup);
2444   }
2445 
2446   // returns the index of the pixel for the current workitem to compute.
2447   // returns -1 if this workitem doesn't need to participate in any computation
2448   inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2449     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2450     int pixelIndex = itemID/numWorkItemsPerPixel;
2451     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2452     return pixelIndex;
2453   }
2454 
2455   )
2456 
2457   STRINGIFY(
2458   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2459     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2460       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2461       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2462       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2463       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2464       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2465       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2466       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2467   {
2468     // calculate the range of resized image pixels computed by this workgroup
2469     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2470     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2471     const unsigned int actualNumPixelToCompute = stopX - startX;
2472 
2473     // calculate the range of input image pixels to cache
2474     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2475     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2476     scale = PerceptibleReciprocal(scale);
2477 
2478     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2479     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2480 
2481     // cache the input pixels into local memory
2482     const unsigned int y = get_global_id(1);
2483     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2484     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2485     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2486     wait_group_events(1, &e);
2487 
2488     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2489     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2490     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2491     {
2492       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2493       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2494       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2495 
2496       // determine which resized pixel computed by this workitem
2497       const unsigned int itemID = get_local_id(0);
2498       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2499 
2500       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2501 
2502       float4 filteredPixel = (float4)0.0f;
2503       float density = 0.0f;
2504       float gamma = 0.0f;
2505       // -1 means this workitem doesn't participate in the computation
2506       if (pixelIndex != -1)
2507       {
2508         // x coordinated of the resized pixel computed by this workitem
2509         const int x = chunkStartX + pixelIndex;
2510 
2511         // calculate how many steps required for this pixel
2512         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2513         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2514         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2515         const unsigned int n = stop - start;
2516 
2517         // calculate how many steps this workitem will contribute
2518         unsigned int numStepsPerWorkItem = n / numItems;
2519         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2520 
2521         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2522         if (startStep < n)
2523         {
2524           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2525 
2526           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2527           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2528           {
2529             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2530               (ResizeWeightingFunctionType) resizeFilterType,
2531               (ResizeWeightingFunctionType) resizeWindowType,
2532               resizeFilterScale, resizeFilterWindowSupport,
2533               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2534 
2535             float4 cp = (float4)0.0f;
2536 
2537             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2538             cp.x = (float) *(p);
2539             if (number_channels > 2)
2540             {
2541               cp.y = (float) *(p + 1);
2542               cp.z = (float) *(p + 2);
2543             }
2544 
2545             if (alpha_index != 0)
2546             {
2547               cp.w = (float) *(p + alpha_index);
2548 
2549               float alpha = weight * QuantumScale * cp.w;
2550 
2551               filteredPixel.x += alpha * cp.x;
2552               filteredPixel.y += alpha * cp.y;
2553               filteredPixel.z += alpha * cp.z;
2554               filteredPixel.w += weight * cp.w;
2555               gamma += alpha;
2556             }
2557             else
2558               filteredPixel += ((float4) weight)*cp;
2559 
2560             density += weight;
2561           }
2562         }
2563       }
2564 
2565       // initialize the accumulators to zero
2566       if (itemID < actualNumPixelInThisChunk) {
2567         outputPixelCache[itemID] = (float4)0.0f;
2568         densityCache[itemID] = 0.0f;
2569         if (alpha_index != 0)
2570           gammaCache[itemID] = 0.0f;
2571       }
2572       barrier(CLK_LOCAL_MEM_FENCE);
2573 
2574       // accumulatte the filtered pixel value and the density
2575       for (unsigned int i = 0; i < numItems; i++) {
2576         if (pixelIndex != -1) {
2577           if (itemID%numItems == i) {
2578             outputPixelCache[pixelIndex]+=filteredPixel;
2579             densityCache[pixelIndex]+=density;
2580             if (alpha_index != 0)
2581               gammaCache[pixelIndex]+=gamma;
2582           }
2583         }
2584         barrier(CLK_LOCAL_MEM_FENCE);
2585       }
2586 
2587       if (itemID < actualNumPixelInThisChunk)
2588       {
2589         float4 filteredPixel = outputPixelCache[itemID];
2590 
2591         float gamma = 0.0f;
2592         if (alpha_index != 0)
2593           gamma = gammaCache[itemID];
2594 
2595         float density = densityCache[itemID];
2596         if ((density != 0.0f) && (density != 1.0f))
2597         {
2598           density = PerceptibleReciprocal(density);
2599           filteredPixel *= (float4) density;
2600           if (alpha_index != 0)
2601             gamma *= density;
2602         }
2603 
2604         if (alpha_index != 0)
2605         {
2606           gamma = PerceptibleReciprocal(gamma);
2607           filteredPixel.x *= gamma;
2608           filteredPixel.y *= gamma;
2609           filteredPixel.z *= gamma;
2610         }
2611 
2612         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2613       }
2614     }
2615   }
2616   )
2617 
2618 
2619   STRINGIFY(
2620  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2621     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2622       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2623       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2624       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2625       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2626       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2627       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2628       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2629   {
2630     // calculate the range of resized image pixels computed by this workgroup
2631     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2632     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2633     const unsigned int actualNumPixelToCompute = stopY - startY;
2634 
2635     // calculate the range of input image pixels to cache
2636     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2637     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2638     scale = PerceptibleReciprocal(scale);
2639 
2640     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2641     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2642 
2643     // cache the input pixels into local memory
2644     const unsigned int x = get_global_id(0);
2645     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2646     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2647     unsigned int stride = inputColumns * number_channels;
2648     for (unsigned int i = 0; i < number_channels; i++)
2649     {
2650       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2651       wait_group_events(1,&e);
2652     }
2653 
2654     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2655     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2656     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2657     {
2658       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2659       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2660       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2661 
2662       // determine which resized pixel computed by this workitem
2663       const unsigned int itemID = get_local_id(1);
2664       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2665 
2666       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2667 
2668       float4 filteredPixel = (float4)0.0f;
2669       float density = 0.0f;
2670       float gamma = 0.0f;
2671       // -1 means this workitem doesn't participate in the computation
2672       if (pixelIndex != -1)
2673       {
2674         // x coordinated of the resized pixel computed by this workitem
2675         const int y = chunkStartY + pixelIndex;
2676 
2677         // calculate how many steps required for this pixel
2678         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2679         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2680         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2681         const unsigned int n = stop - start;
2682 
2683         // calculate how many steps this workitem will contribute
2684         unsigned int numStepsPerWorkItem = n / numItems;
2685         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2686 
2687         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2688         if (startStep < n)
2689         {
2690           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2691 
2692           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2693           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2694           {
2695             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2696               (ResizeWeightingFunctionType) resizeFilterType,
2697               (ResizeWeightingFunctionType) resizeWindowType,
2698               resizeFilterScale, resizeFilterWindowSupport,
2699               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2700 
2701             float4 cp = (float4)0.0f;
2702 
2703             __local CLQuantum *p = inputImageCache + cacheIndex;
2704             cp.x = (float) *(p);
2705             if (number_channels > 2)
2706             {
2707               cp.y = (float) *(p + rangeLength);
2708               cp.z = (float) *(p + (rangeLength * 2));
2709             }
2710 
2711             if (alpha_index != 0)
2712             {
2713               cp.w = (float) *(p + (rangeLength * alpha_index));
2714 
2715               float alpha = weight * QuantumScale * cp.w;
2716 
2717               filteredPixel.x += alpha * cp.x;
2718               filteredPixel.y += alpha * cp.y;
2719               filteredPixel.z += alpha * cp.z;
2720               filteredPixel.w += weight * cp.w;
2721               gamma += alpha;
2722             }
2723             else
2724               filteredPixel += ((float4) weight)*cp;
2725 
2726             density += weight;
2727           }
2728         }
2729       }
2730 
2731       // initialize the accumulators to zero
2732       if (itemID < actualNumPixelInThisChunk) {
2733         outputPixelCache[itemID] = (float4)0.0f;
2734         densityCache[itemID] = 0.0f;
2735         if (alpha_index != 0)
2736           gammaCache[itemID] = 0.0f;
2737       }
2738       barrier(CLK_LOCAL_MEM_FENCE);
2739 
2740       // accumulatte the filtered pixel value and the density
2741       for (unsigned int i = 0; i < numItems; i++) {
2742         if (pixelIndex != -1) {
2743           if (itemID%numItems == i) {
2744             outputPixelCache[pixelIndex]+=filteredPixel;
2745             densityCache[pixelIndex]+=density;
2746             if (alpha_index != 0)
2747               gammaCache[pixelIndex]+=gamma;
2748           }
2749         }
2750         barrier(CLK_LOCAL_MEM_FENCE);
2751       }
2752 
2753       if (itemID < actualNumPixelInThisChunk)
2754       {
2755         float4 filteredPixel = outputPixelCache[itemID];
2756 
2757         float gamma = 0.0f;
2758         if (alpha_index != 0)
2759           gamma = gammaCache[itemID];
2760 
2761         float density = densityCache[itemID];
2762         if ((density != 0.0f) && (density != 1.0f))
2763         {
2764           density = PerceptibleReciprocal(density);
2765           filteredPixel *= (float4) density;
2766           if (alpha_index != 0)
2767             gamma *= density;
2768         }
2769 
2770         if (alpha_index != 0)
2771         {
2772           gamma = PerceptibleReciprocal(gamma);
2773           filteredPixel.x *= gamma;
2774           filteredPixel.y *= gamma;
2775           filteredPixel.z *= gamma;
2776         }
2777 
2778         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2779       }
2780     }
2781   }
2782   )
2783 
2784 /*
2785 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2786 %                                                                             %
2787 %                                                                             %
2788 %                                                                             %
2789 %     R o t a t i o n a l B l u r                                             %
2790 %                                                                             %
2791 %                                                                             %
2792 %                                                                             %
2793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2794 */
2795 
2796   STRINGIFY(
2797   __kernel void RotationalBlur(const __global CLQuantum *image,
2798     const unsigned int number_channels,const unsigned int channel,
2799     const float2 blurCenter,__constant float *cos_theta,
2800     __constant float *sin_theta,const unsigned int cossin_theta_size,
2801     __global CLQuantum *filteredImage)
2802   {
2803     const int x = get_global_id(0);
2804     const int y = get_global_id(1);
2805     const int columns = get_global_size(0);
2806     const int rows = get_global_size(1);
2807     unsigned int step = 1;
2808     float center_x = (float) x - blurCenter.x;
2809     float center_y = (float) y - blurCenter.y;
2810     float radius = hypot(center_x, center_y);
2811 
2812     float blur_radius = hypot(blurCenter.x, blurCenter.y);
2813 
2814     if (radius > MagickEpsilon)
2815     {
2816       step = (unsigned int) (blur_radius / radius);
2817       if (step == 0)
2818         step = 1;
2819       if (step >= cossin_theta_size)
2820         step = cossin_theta_size-1;
2821     }
2822 
2823     float4 result = 0.0f;
2824     float normalize = 0.0f;
2825     float gamma = 0.0f;
2826 
2827     for (unsigned int i=0; i<cossin_theta_size; i+=step)
2828     {
2829       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2830       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2831 
2832       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2833 
2834       if ((number_channels == 4) || (number_channels == 2))
2835       {
2836         float alpha = (float)(QuantumScale*pixel.w);
2837 
2838         gamma += alpha;
2839 
2840         result.x += alpha * pixel.x;
2841         result.y += alpha * pixel.y;
2842         result.z += alpha * pixel.z;
2843         result.w += pixel.w;
2844       }
2845       else
2846         result += pixel;
2847 
2848       normalize += 1.0f;
2849     }
2850 
2851     normalize = PerceptibleReciprocal(normalize);
2852 
2853     if ((number_channels == 4) || (number_channels == 2))
2854     {
2855       gamma = PerceptibleReciprocal(gamma);
2856       result.x *= gamma;
2857       result.y *= gamma;
2858       result.z *= gamma;
2859       result.w *= normalize;
2860     }
2861     else
2862       result *= normalize;
2863 
2864     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2865   }
2866   )
2867 
2868 /*
2869 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2870 %                                                                             %
2871 %                                                                             %
2872 %                                                                             %
2873 %     U n s h a r p M a s k                                                   %
2874 %                                                                             %
2875 %                                                                             %
2876 %                                                                             %
2877 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2878 */
2879 
2880   STRINGIFY(
2881   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2882     const __global float4 *blurRowData,const unsigned int number_channels,
2883     const ChannelType channel,const unsigned int columns,
2884     const unsigned int rows,__local float4* cachedData,
2885     __local float* cachedFilter,const __global float *filter,
2886     const unsigned int width,const float gain, const float threshold,
2887     __global CLQuantum *filteredImage)
2888   {
2889     const unsigned int radius = (width-1)/2;
2890 
2891     // cache the pixel shared by the workgroup
2892     const int groupX = get_group_id(0);
2893     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2894     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2895 
2896     if ((groupStartY >= 0) && (groupStopY < rows))
2897     {
2898       event_t e = async_work_group_strided_copy(cachedData,
2899         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2900       wait_group_events(1,&e);
2901     }
2902     else
2903     {
2904       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2905         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2906 
2907       barrier(CLK_LOCAL_MEM_FENCE);
2908     }
2909     // cache the filter as well
2910     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2911     wait_group_events(1,&e);
2912 
2913     // only do the work if this is not a patched item
2914     const int cy = get_global_id(1);
2915 
2916     if (cy < rows)
2917     {
2918       float4 blurredPixel = (float4) 0.0f;
2919 
2920       int i = 0;
2921 
2922       for ( ; i+7 < width; )
2923       {
2924         for (int j=0; j < 8; j++, i++)
2925           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2926       }
2927 
2928       for ( ; i < width; i++)
2929         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2930 
2931       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2932       float4 outputPixel = inputImagePixel - blurredPixel;
2933 
2934       float quantumThreshold = QuantumRange*threshold;
2935 
2936       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2937       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2938 
2939       //write back
2940       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2941     }
2942   }
2943   )
2944 
2945   STRINGIFY(
2946   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2947     const ChannelType channel,__constant float *filter,const unsigned int width,
2948     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2949     const float gain,const float threshold,__global CLQuantum *filteredImage)
2950   {
2951     const unsigned int x = get_global_id(0);
2952     const unsigned int y = get_global_id(1);
2953 
2954     const unsigned int radius = (width - 1) / 2;
2955 
2956     int row = y - radius;
2957     int baseRow = get_group_id(1) * get_local_size(1) - radius;
2958     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2959 
2960     while (row < endRow) {
2961       int srcy = (row < 0) ? -row : row; // mirror pad
2962       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2963 
2964       float4 value = 0.0f;
2965 
2966       int ix = x - radius;
2967       int i = 0;
2968 
2969       while (i + 7 < width) {
2970         for (int j = 0; j < 8; ++j) { // unrolled
2971           int srcx = ix + j;
2972           srcx = (srcx < 0) ? -srcx : srcx;
2973           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2974           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2975         }
2976         ix += 8;
2977         i += 8;
2978       }
2979 
2980       while (i < width) {
2981         int srcx = (ix < 0) ? -ix : ix; // mirror pad
2982         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2983         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2984         ++i;
2985         ++ix;
2986       }
2987       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2988       row += get_local_size(1);
2989     }
2990 
2991     barrier(CLK_LOCAL_MEM_FENCE);
2992 
2993     const int px = get_local_id(0);
2994     const int py = get_local_id(1);
2995     const int prp = get_local_size(0);
2996     float4 value = (float4)(0.0f);
2997 
2998     int i = 0;
2999     while (i + 7 < width) {
3000       for (int j = 0; j < 8; ++j) // unrolled
3001         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
3002       i += 8;
3003     }
3004     while (i < width) {
3005       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3006       ++i;
3007     }
3008 
3009     if ((x < columns) && (y < rows)) {
3010       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3011       float4 diff = srcPixel - value;
3012 
3013       float quantumThreshold = QuantumRange*threshold;
3014 
3015       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3016       value = select(srcPixel + diff * gain, srcPixel, mask);
3017 
3018       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3019     }
3020   }
3021   )
3022 
3023   STRINGIFY(
3024     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3025     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3026       const unsigned int number_channels,const unsigned int max_channels,
3027       const float threshold,const int passes,const unsigned int imageWidth,
3028       const unsigned int imageHeight)
3029   {
3030     const int pad = (1 << (passes - 1));
3031     const int tileSize = 64;
3032     const int tileRowPixels = 64;
3033     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3034 
3035     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3036 
3037     local float buffer[64 * 64];
3038 
3039     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3040     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3041 
3042     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3043       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3044                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3045 
3046       for (int channel = 0; channel < max_channels; ++channel)
3047         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3048     }
3049 
3050     for (int channel = 0; channel < max_channels; ++channel) {
3051       // Load LDS
3052       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3053         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3054 
3055       // Process
3056 
3057       float tmp[16];
3058       float accum[16];
3059       float pixel;
3060 
3061       for (int i = 0; i < 16; i++)
3062         accum[i]=0.0f;
3063 
3064       for (int pass = 0; pass < passes; ++pass) {
3065         const int radius = 1 << pass;
3066         const int x = get_local_id(0);
3067         const float thresh = threshold * noise[pass];
3068 
3069         // Apply horizontal hat
3070         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3071           const int offset = i * tileRowPixels;
3072           if (pass == 0)
3073             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3074           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3075           barrier(CLK_LOCAL_MEM_FENCE);
3076           buffer[x + offset] = pixel;
3077         }
3078         barrier(CLK_LOCAL_MEM_FENCE);
3079 
3080         // Apply vertical hat
3081         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3082           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3083           float delta = tmp[i / 4] - pixel;
3084           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3085           if (delta < -thresh)
3086             delta += thresh;
3087           else if (delta > thresh)
3088             delta -= thresh;
3089           else
3090             delta = 0;
3091           accum[i / 4] += delta;
3092         }
3093         barrier(CLK_LOCAL_MEM_FENCE);
3094 
3095         if (pass < passes - 1)
3096           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3097             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3098         else  // last pass
3099           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3100             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3101         barrier(CLK_LOCAL_MEM_FENCE);
3102       }
3103 
3104       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3105         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3106 
3107       barrier(CLK_LOCAL_MEM_FENCE);
3108     }
3109 
3110     // Write from stage to output
3111 
3112     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3113       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3114         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3115           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3116           for (int channel = 0; channel < max_channels; ++channel) {
3117             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3118           }
3119         }
3120       }
3121     }
3122   }
3123   )
3124 
3125   ;
3126 
3127 #endif // MAGICKCORE_OPENCL_SUPPORT
3128 
3129 #if defined(__cplusplus) || defined(c_plusplus)
3130 }
3131 #endif
3132 
3133 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3134