1 /* 2 Copyright 1999-2019 ImageMagick Studio LLC, a non-profit organization 3 dedicated to making software imaging solutions freely available. 4 5 You may not use this file except in compliance with the License. You may 6 obtain a copy of the License at 7 8 https://imagemagick.org/script/license.php 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 16 MagickCore private kernels for accelerated functions. 17 */ 18 19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 21 22 #if defined(__cplusplus) || defined(c_plusplus) 23 extern "C" { 24 #endif 25 26 #if defined(MAGICKCORE_OPENCL_SUPPORT) 27 28 /* 29 Define declarations. 30 */ 31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" 32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" 33 #define OPENCL_ELSE() "\n #""else " " \n" 34 #define OPENCL_ENDIF() "\n #""endif " " \n" 35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" 36 #define STRINGIFY(...) #__VA_ARGS__ "\n" 37 38 const char *accelerateKernels = 39 40 /* 41 Define declarations. 42 */ 43 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) 44 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) 45 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) 46 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) 47 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) 48 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) 49 OPENCL_DEFINE(SigmaRandom, (attenuate)) 50 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) 51 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) 52 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) 53 54 /* 55 Typedef declarations. 56 */ 57 STRINGIFY( 58 typedef enum 59 { 60 UndefinedColorspace, 61 CMYColorspace, /* negated linear RGB colorspace */ 62 CMYKColorspace, /* CMY with Black separation */ 63 GRAYColorspace, /* Single Channel greyscale (non-linear) image */ 64 HCLColorspace, 65 HCLpColorspace, 66 HSBColorspace, 67 HSIColorspace, 68 HSLColorspace, 69 HSVColorspace, /* alias for HSB */ 70 HWBColorspace, 71 LabColorspace, 72 LCHColorspace, /* alias for LCHuv */ 73 LCHabColorspace, /* Cylindrical (Polar) Lab */ 74 LCHuvColorspace, /* Cylindrical (Polar) Luv */ 75 LogColorspace, 76 LMSColorspace, 77 LuvColorspace, 78 OHTAColorspace, 79 Rec601YCbCrColorspace, 80 Rec709YCbCrColorspace, 81 RGBColorspace, /* Linear RGB colorspace */ 82 scRGBColorspace, /* ??? */ 83 sRGBColorspace, /* Default: non-linear sRGB colorspace */ 84 TransparentColorspace, 85 xyYColorspace, 86 XYZColorspace, /* IEEE Color Reference colorspace */ 87 YCbCrColorspace, 88 YCCColorspace, 89 YDbDrColorspace, 90 YIQColorspace, 91 YPbPrColorspace, 92 YUVColorspace, 93 LinearGRAYColorspace /* Single Channel greyscale (linear) image */ 94 } ColorspaceType; 95 ) 96 97 STRINGIFY( 98 typedef enum 99 { 100 UndefinedCompositeOp, 101 AlphaCompositeOp, 102 AtopCompositeOp, 103 BlendCompositeOp, 104 BlurCompositeOp, 105 BumpmapCompositeOp, 106 ChangeMaskCompositeOp, 107 ClearCompositeOp, 108 ColorBurnCompositeOp, 109 ColorDodgeCompositeOp, 110 ColorizeCompositeOp, 111 CopyBlackCompositeOp, 112 CopyBlueCompositeOp, 113 CopyCompositeOp, 114 CopyCyanCompositeOp, 115 CopyGreenCompositeOp, 116 CopyMagentaCompositeOp, 117 CopyAlphaCompositeOp, 118 CopyRedCompositeOp, 119 CopyYellowCompositeOp, 120 DarkenCompositeOp, 121 DarkenIntensityCompositeOp, 122 DifferenceCompositeOp, 123 DisplaceCompositeOp, 124 DissolveCompositeOp, 125 DistortCompositeOp, 126 DivideDstCompositeOp, 127 DivideSrcCompositeOp, 128 DstAtopCompositeOp, 129 DstCompositeOp, 130 DstInCompositeOp, 131 DstOutCompositeOp, 132 DstOverCompositeOp, 133 ExclusionCompositeOp, 134 HardLightCompositeOp, 135 HardMixCompositeOp, 136 HueCompositeOp, 137 InCompositeOp, 138 IntensityCompositeOp, 139 LightenCompositeOp, 140 LightenIntensityCompositeOp, 141 LinearBurnCompositeOp, 142 LinearDodgeCompositeOp, 143 LinearLightCompositeOp, 144 LuminizeCompositeOp, 145 MathematicsCompositeOp, 146 MinusDstCompositeOp, 147 MinusSrcCompositeOp, 148 ModulateCompositeOp, 149 ModulusAddCompositeOp, 150 ModulusSubtractCompositeOp, 151 MultiplyCompositeOp, 152 NoCompositeOp, 153 OutCompositeOp, 154 OverCompositeOp, 155 OverlayCompositeOp, 156 PegtopLightCompositeOp, 157 PinLightCompositeOp, 158 PlusCompositeOp, 159 ReplaceCompositeOp, 160 SaturateCompositeOp, 161 ScreenCompositeOp, 162 SoftLightCompositeOp, 163 SrcAtopCompositeOp, 164 SrcCompositeOp, 165 SrcInCompositeOp, 166 SrcOutCompositeOp, 167 SrcOverCompositeOp, 168 ThresholdCompositeOp, 169 VividLightCompositeOp, 170 XorCompositeOp, 171 StereoCompositeOp 172 } CompositeOperator; 173 ) 174 175 STRINGIFY( 176 typedef enum 177 { 178 UndefinedFunction, 179 ArcsinFunction, 180 ArctanFunction, 181 PolynomialFunction, 182 SinusoidFunction 183 } MagickFunction; 184 ) 185 186 STRINGIFY( 187 typedef enum 188 { 189 UndefinedNoise, 190 UniformNoise, 191 GaussianNoise, 192 MultiplicativeGaussianNoise, 193 ImpulseNoise, 194 LaplacianNoise, 195 PoissonNoise, 196 RandomNoise 197 } NoiseType; 198 ) 199 200 STRINGIFY( 201 typedef enum 202 { 203 UndefinedPixelIntensityMethod = 0, 204 AveragePixelIntensityMethod, 205 BrightnessPixelIntensityMethod, 206 LightnessPixelIntensityMethod, 207 MSPixelIntensityMethod, 208 Rec601LumaPixelIntensityMethod, 209 Rec601LuminancePixelIntensityMethod, 210 Rec709LumaPixelIntensityMethod, 211 Rec709LuminancePixelIntensityMethod, 212 RMSPixelIntensityMethod 213 } PixelIntensityMethod; 214 ) 215 216 STRINGIFY( 217 typedef enum 218 { 219 BoxWeightingFunction = 0, 220 TriangleWeightingFunction, 221 CubicBCWeightingFunction, 222 HannWeightingFunction, 223 HammingWeightingFunction, 224 BlackmanWeightingFunction, 225 GaussianWeightingFunction, 226 QuadraticWeightingFunction, 227 JincWeightingFunction, 228 SincWeightingFunction, 229 SincFastWeightingFunction, 230 KaiserWeightingFunction, 231 WelchWeightingFunction, 232 BohmanWeightingFunction, 233 LagrangeWeightingFunction, 234 CosineWeightingFunction 235 } ResizeWeightingFunctionType; 236 ) 237 238 STRINGIFY( 239 typedef enum 240 { 241 UndefinedChannel = 0x0000, 242 RedChannel = 0x0001, 243 GrayChannel = 0x0001, 244 CyanChannel = 0x0001, 245 GreenChannel = 0x0002, 246 MagentaChannel = 0x0002, 247 BlueChannel = 0x0004, 248 YellowChannel = 0x0004, 249 BlackChannel = 0x0008, 250 AlphaChannel = 0x0010, 251 OpacityChannel = 0x0010, 252 IndexChannel = 0x0020, /* Color Index Table? */ 253 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ 254 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ 255 MetaChannel = 0x0100, /* ???? */ 256 CompositeChannels = 0x001F, 257 AllChannels = 0x7ffffff, 258 /* 259 Special purpose channel types. 260 FUTURE: are these needed any more - they are more like hacks 261 SyncChannels for example is NOT a real channel but a 'flag' 262 It really says -- "User has not defined channels" 263 Though it does have extra meaning in the "-auto-level" operator 264 */ 265 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ 266 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ 267 GrayChannels = 0x0400, 268 SyncChannels = 0x20000, /* channels modified as a single unit */ 269 DefaultChannels = AllChannels 270 } ChannelType; /* must correspond to PixelChannel */ 271 ) 272 273 /* 274 Helper functions. 275 */ 276 277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) 278 279 STRINGIFY( 280 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 281 { 282 return((CLQuantum) value); 283 } 284 ) 285 286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) 287 288 STRINGIFY( 289 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 290 { 291 return((CLQuantum) (257.0f*value)); 292 } 293 ) 294 295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) 296 297 STRINGIFY( 298 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 299 { 300 return((CLQuantum) (16843009.0*value)); 301 } 302 ) 303 304 OPENCL_ENDIF() 305 306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1)) 307 308 STRINGIFY( 309 inline CLQuantum ClampToQuantum(const float value) 310 { 311 return (CLQuantum) clamp(value, 0.0f, QuantumRange); 312 } 313 ) 314 315 OPENCL_ELSE() 316 317 STRINGIFY( 318 inline CLQuantum ClampToQuantum(const float value) 319 { 320 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); 321 } 322 ) 323 324 OPENCL_ENDIF() 325 326 STRINGIFY( 327 inline int ClampToCanvas(const int offset,const int range) 328 { 329 return clamp(offset, (int)0, range-1); 330 } 331 ) 332 333 STRINGIFY( 334 inline uint ScaleQuantumToMap(CLQuantum value) 335 { 336 if (value >= (CLQuantum) MaxMap) 337 return ((uint)MaxMap); 338 else 339 return ((uint)value); 340 } 341 ) 342 343 STRINGIFY( 344 inline float PerceptibleReciprocal(const float x) 345 { 346 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; 347 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); 348 } 349 ) 350 351 STRINGIFY( 352 inline float RoundToUnity(const float value) 353 { 354 return clamp(value,0.0f,1.0f); 355 } 356 ) 357 358 STRINGIFY( 359 360 inline unsigned int getPixelIndex(const unsigned int number_channels, 361 const unsigned int columns, const unsigned int x, const unsigned int y) 362 { 363 return (x * number_channels) + (y * columns * number_channels); 364 } 365 366 inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } 367 inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } 368 inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } 369 inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } 370 371 inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } 372 inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } 373 inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } 374 inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } 375 376 inline CLQuantum getBlue(CLPixelType p) { return p.x; } 377 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } 378 inline float getBlueF4(float4 p) { return p.x; } 379 inline void setBlueF4(float4* p, float value) { (*p).x = value; } 380 381 inline CLQuantum getGreen(CLPixelType p) { return p.y; } 382 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } 383 inline float getGreenF4(float4 p) { return p.y; } 384 inline void setGreenF4(float4* p, float value) { (*p).y = value; } 385 386 inline CLQuantum getRed(CLPixelType p) { return p.z; } 387 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } 388 inline float getRedF4(float4 p) { return p.z; } 389 inline void setRedF4(float4* p, float value) { (*p).z = value; } 390 391 inline CLQuantum getAlpha(CLPixelType p) { return p.w; } 392 inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } 393 inline float getAlphaF4(float4 p) { return p.w; } 394 inline void setAlphaF4(float4* p, float value) { (*p).w = value; } 395 396 inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, 397 const ChannelType channel, float *red, float *green, float *blue, float *alpha) 398 { 399 if ((channel & RedChannel) != 0) 400 *red=getPixelRed(p); 401 402 if (number_channels > 2) 403 { 404 if ((channel & GreenChannel) != 0) 405 *green=getPixelGreen(p); 406 407 if ((channel & BlueChannel) != 0) 408 *blue=getPixelBlue(p); 409 } 410 411 if (((number_channels == 4) || (number_channels == 2)) && 412 ((channel & AlphaChannel) != 0)) 413 *alpha=getPixelAlpha(p,number_channels); 414 } 415 416 inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels, 417 const unsigned int columns, const unsigned int x, const unsigned int y) 418 { 419 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 420 421 float4 pixel; 422 423 pixel.x=getPixelRed(p); 424 425 if (number_channels > 2) 426 { 427 pixel.y=getPixelGreen(p); 428 pixel.z=getPixelBlue(p); 429 } 430 431 if ((number_channels == 4) || (number_channels == 2)) 432 pixel.w=getPixelAlpha(p,number_channels); 433 return(pixel); 434 } 435 436 inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, 437 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) 438 { 439 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 440 441 float red = 0.0f; 442 float green = 0.0f; 443 float blue = 0.0f; 444 float alpha = 0.0f; 445 446 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 447 return (float4)(red, green, blue, alpha); 448 } 449 450 inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, 451 const ChannelType channel, float red, float green, float blue, float alpha) 452 { 453 if ((channel & RedChannel) != 0) 454 setPixelRed(p,ClampToQuantum(red)); 455 456 if (number_channels > 2) 457 { 458 if ((channel & GreenChannel) != 0) 459 setPixelGreen(p,ClampToQuantum(green)); 460 461 if ((channel & BlueChannel) != 0) 462 setPixelBlue(p,ClampToQuantum(blue)); 463 } 464 465 if (((number_channels == 4) || (number_channels == 2)) && 466 ((channel & AlphaChannel) != 0)) 467 setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); 468 } 469 470 inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels, 471 const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel) 472 { 473 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 474 475 setPixelRed(p,ClampToQuantum(pixel.x)); 476 477 if (number_channels > 2) 478 { 479 setPixelGreen(p,ClampToQuantum(pixel.y)); 480 setPixelBlue(p,ClampToQuantum(pixel.z)); 481 } 482 483 if ((number_channels == 4) || (number_channels == 2)) 484 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w)); 485 } 486 487 inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, 488 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, 489 float4 pixel) 490 { 491 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 492 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); 493 } 494 495 inline float GetPixelIntensity(const unsigned int colorspace, 496 const unsigned int method,float red,float green,float blue) 497 { 498 float intensity; 499 500 if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace)) 501 return red; 502 503 switch (method) 504 { 505 case AveragePixelIntensityMethod: 506 { 507 intensity=(red+green+blue)/3.0; 508 break; 509 } 510 case BrightnessPixelIntensityMethod: 511 { 512 intensity=MagickMax(MagickMax(red,green),blue); 513 break; 514 } 515 case LightnessPixelIntensityMethod: 516 { 517 intensity=(MagickMin(MagickMin(red,green),blue)+ 518 MagickMax(MagickMax(red,green),blue))/2.0; 519 break; 520 } 521 case MSPixelIntensityMethod: 522 { 523 intensity=(float) (((float) red*red+green*green+blue*blue)/ 524 (3.0*QuantumRange)); 525 break; 526 } 527 case Rec601LumaPixelIntensityMethod: 528 { 529 /* 530 if (image->colorspace == RGBColorspace) 531 { 532 red=EncodePixelGamma(red); 533 green=EncodePixelGamma(green); 534 blue=EncodePixelGamma(blue); 535 } 536 */ 537 intensity=0.298839*red+0.586811*green+0.114350*blue; 538 break; 539 } 540 case Rec601LuminancePixelIntensityMethod: 541 { 542 /* 543 if (image->colorspace == sRGBColorspace) 544 { 545 red=DecodePixelGamma(red); 546 green=DecodePixelGamma(green); 547 blue=DecodePixelGamma(blue); 548 } 549 */ 550 intensity=0.298839*red+0.586811*green+0.114350*blue; 551 break; 552 } 553 case Rec709LumaPixelIntensityMethod: 554 default: 555 { 556 /* 557 if (image->colorspace == RGBColorspace) 558 { 559 red=EncodePixelGamma(red); 560 green=EncodePixelGamma(green); 561 blue=EncodePixelGamma(blue); 562 } 563 */ 564 intensity=0.212656*red+0.715158*green+0.072186*blue; 565 break; 566 } 567 case Rec709LuminancePixelIntensityMethod: 568 { 569 /* 570 if (image->colorspace == sRGBColorspace) 571 { 572 red=DecodePixelGamma(red); 573 green=DecodePixelGamma(green); 574 blue=DecodePixelGamma(blue); 575 } 576 */ 577 intensity=0.212656*red+0.715158*green+0.072186*blue; 578 break; 579 } 580 case RMSPixelIntensityMethod: 581 { 582 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ 583 sqrt(3.0)); 584 break; 585 } 586 } 587 588 return intensity; 589 } 590 591 inline int mirrorBottom(int value) 592 { 593 return (value < 0) ? - (value) : value; 594 } 595 596 inline int mirrorTop(int value, int width) 597 { 598 return (value >= width) ? (2 * width - value - 1) : value; 599 } 600 ) 601 602 /* 603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 604 % % 605 % % 606 % % 607 % A d d N o i s e % 608 % % 609 % % 610 % % 611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 612 */ 613 614 STRINGIFY( 615 /* 616 Part of MWC64X by David Thomas, dt10@imperial.ac.uk 617 This is provided under BSD, full license is with the main package. 618 See http://www.doc.ic.ac.uk/~dt10/research 619 */ 620 621 // Pre: a<M, b<M 622 // Post: r=(a+b) mod M 623 ulong MWC_AddMod64(ulong a, ulong b, ulong M) 624 { 625 ulong v=a+b; 626 //if( (v>=M) || (v<a) ) 627 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. 628 v=v-M; 629 return v; 630 } 631 632 // Pre: a<M,b<M 633 // Post: r=(a*b) mod M 634 // This could be done more efficently, but it is portable, and should 635 // be easy to understand. It can be replaced with any of the better 636 // modular multiplication algorithms (for example if you know you have 637 // double precision available or something). 638 ulong MWC_MulMod64(ulong a, ulong b, ulong M) 639 { 640 ulong r=0; 641 while(a!=0){ 642 if(a&1) 643 r=MWC_AddMod64(r,b,M); 644 b=MWC_AddMod64(b,b,M); 645 a=a>>1; 646 } 647 return r; 648 } 649 650 // Pre: a<M, e>=0 651 // Post: r=(a^b) mod M 652 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on 653 // most architectures 654 ulong MWC_PowMod64(ulong a, ulong e, ulong M) 655 { 656 ulong sqr=a, acc=1; 657 while(e!=0){ 658 if(e&1) 659 acc=MWC_MulMod64(acc,sqr,M); 660 sqr=MWC_MulMod64(sqr,sqr,M); 661 e=e>>1; 662 } 663 return acc; 664 } 665 666 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) 667 { 668 ulong m=MWC_PowMod64(A, distance, M); 669 ulong x=curr.x*(ulong)A+curr.y; 670 x=MWC_MulMod64(x, m, M); 671 return (uint2)((uint)(x/A), (uint)(x%A)); 672 } 673 674 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) 675 { 676 // This is an arbitrary constant for starting LCG jumping from. I didn't 677 // want to start from 1, as then you end up with the two or three first values 678 // being a bit poor in ones - once you've decided that, one constant is as 679 // good as any another. There is no deep mathematical reason for it, I just 680 // generated a random number. 681 enum{ MWC_BASEID = 4077358422479273989UL }; 682 683 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; 684 ulong m=MWC_PowMod64(A, dist, M); 685 686 ulong x=MWC_MulMod64(MWC_BASEID, m, M); 687 return (uint2)((uint)(x/A), (uint)(x%A)); 688 } 689 690 //! Represents the state of a particular generator 691 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t; 692 693 void MWC64X_Step(mwc64x_state_t *s) 694 { 695 uint X=s->x, C=s->c; 696 697 uint Xn=s->seed0*X+C; 698 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar 699 uint Cn=mad_hi(s->seed0,X,carry); 700 701 s->x=Xn; 702 s->c=Cn; 703 } 704 705 void MWC64X_Skip(mwc64x_state_t *s, ulong distance) 706 { 707 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance); 708 s->x=tmp.x; 709 s->c=tmp.y; 710 } 711 712 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) 713 { 714 uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset); 715 s->x=tmp.x; 716 s->c=tmp.y; 717 } 718 719 //! Return a 32-bit integer in the range [0..2^32) 720 uint MWC64X_NextUint(mwc64x_state_t *s) 721 { 722 uint res=s->x ^ s->c; 723 MWC64X_Step(s); 724 return res; 725 } 726 727 // 728 // End of MWC64X excerpt 729 // 730 731 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) 732 { 733 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 734 } 735 736 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) 737 { 738 float 739 alpha, 740 beta, 741 noise, 742 sigma; 743 744 noise = 0.0f; 745 alpha=mwcReadPseudoRandomValue(r); 746 switch(noise_type) 747 { 748 case UniformNoise: 749 default: 750 { 751 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); 752 break; 753 } 754 case GaussianNoise: 755 { 756 float 757 gamma, 758 tau; 759 760 if (alpha == 0.0f) 761 alpha=1.0f; 762 beta=mwcReadPseudoRandomValue(r); 763 gamma=sqrt(-2.0f*log(alpha)); 764 sigma=gamma*cospi((2.0f*beta)); 765 tau=gamma*sinpi((2.0f*beta)); 766 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; 767 break; 768 } 769 case ImpulseNoise: 770 { 771 if (alpha < (SigmaImpulse/2.0f)) 772 noise=0.0f; 773 else 774 if (alpha >= (1.0f-(SigmaImpulse/2.0f))) 775 noise=QuantumRange; 776 else 777 noise=pixel; 778 break; 779 } 780 case LaplacianNoise: 781 { 782 if (alpha <= 0.5f) 783 { 784 if (alpha <= MagickEpsilon) 785 noise=(pixel-QuantumRange); 786 else 787 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ 788 0.5f); 789 break; 790 } 791 beta=1.0f-alpha; 792 if (beta <= (0.5f*MagickEpsilon)) 793 noise=(pixel+QuantumRange); 794 else 795 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); 796 break; 797 } 798 case MultiplicativeGaussianNoise: 799 { 800 sigma=1.0f; 801 if (alpha > MagickEpsilon) 802 sigma=sqrt(-2.0f*log(alpha)); 803 beta=mwcReadPseudoRandomValue(r); 804 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* 805 cospi((2.0f*beta))/2.0f); 806 break; 807 } 808 case PoissonNoise: 809 { 810 float 811 poisson; 812 unsigned int i; 813 poisson=exp(-SigmaPoisson*QuantumScale*pixel); 814 for (i=0; alpha > poisson; i++) 815 { 816 beta=mwcReadPseudoRandomValue(r); 817 alpha*=beta; 818 } 819 noise=(QuantumRange*i/SigmaPoisson); 820 break; 821 } 822 case RandomNoise: 823 { 824 noise=(QuantumRange*SigmaRandom*alpha); 825 break; 826 } 827 } 828 return noise; 829 } 830 831 __kernel 832 void AddNoise(const __global CLQuantum *image, 833 const unsigned int number_channels,const ChannelType channel, 834 const unsigned int length,const unsigned int pixelsPerWorkItem, 835 const NoiseType noise_type,const float attenuate,const unsigned int seed0, 836 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, 837 __global CLQuantum *filteredImage) 838 { 839 mwc64x_state_t rng; 840 rng.seed0 = seed0; 841 rng.seed1 = seed1; 842 843 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use 844 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); 845 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams 846 847 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); 848 uint count = pixelsPerWorkItem; 849 850 while (count > 0 && pos < length) 851 { 852 const __global CLQuantum *p = image + pos; 853 __global CLQuantum *q = filteredImage + pos; 854 855 float red; 856 float green; 857 float blue; 858 float alpha; 859 860 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 861 862 if ((channel & RedChannel) != 0) 863 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); 864 865 if (number_channels > 2) 866 { 867 if ((channel & GreenChannel) != 0) 868 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); 869 870 if ((channel & BlueChannel) != 0) 871 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); 872 } 873 874 if (((number_channels == 4) || (number_channels == 2)) && 875 ((channel & AlphaChannel) != 0)) 876 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); 877 878 WriteChannels(q, number_channels, channel, red, green, blue, alpha); 879 880 pos += (get_local_size(0) * number_channels); 881 count--; 882 } 883 } 884 ) 885 886 /* 887 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 888 % % 889 % % 890 % % 891 % B l u r % 892 % % 893 % % 894 % % 895 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 896 */ 897 898 STRINGIFY( 899 /* 900 Reduce image noise and reduce detail levels by row 901 */ 902 __kernel void BlurRow(const __global CLQuantum *image, 903 const unsigned int number_channels,const ChannelType channel, 904 __constant float *filter,const unsigned int width, 905 const unsigned int imageColumns,const unsigned int imageRows, 906 __local float4 *temp,__global float4 *tempImage) 907 { 908 const int x = get_global_id(0); 909 const int y = get_global_id(1); 910 911 const int columns = imageColumns; 912 913 const unsigned int radius = (width-1)/2; 914 const int wsize = get_local_size(0); 915 const unsigned int loadSize = wsize+width; 916 917 //group coordinate 918 const int groupX=get_local_size(0)*get_group_id(0); 919 920 //parallel load and clamp 921 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) 922 { 923 int cx = ClampToCanvas(i + groupX - radius, columns); 924 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); 925 } 926 927 // barrier 928 barrier(CLK_LOCAL_MEM_FENCE); 929 930 // only do the work if this is not a patched item 931 if (get_global_id(0) < columns) 932 { 933 // compute 934 float4 result = (float4) 0; 935 936 int i = 0; 937 938 for ( ; i+7 < width; ) 939 { 940 for (int j=0; j < 8; j++) 941 result+=filter[i+j]*temp[i+j+get_local_id(0)]; 942 i+=8; 943 } 944 945 for ( ; i < width; i++) 946 result+=filter[i]*temp[i+get_local_id(0)]; 947 948 // write back to global 949 tempImage[y*columns+x] = result; 950 } 951 } 952 ) 953 954 STRINGIFY( 955 /* 956 Reduce image noise and reduce detail levels by line 957 */ 958 __kernel void BlurColumn(const __global float4 *blurRowData, 959 const unsigned int number_channels,const ChannelType channel, 960 __constant float *filter,const unsigned int width, 961 const unsigned int imageColumns,const unsigned int imageRows, 962 __local float4 *temp,__global CLQuantum *filteredImage) 963 { 964 const int x = get_global_id(0); 965 const int y = get_global_id(1); 966 967 const int columns = imageColumns; 968 const int rows = imageRows; 969 970 unsigned int radius = (width-1)/2; 971 const int wsize = get_local_size(1); 972 const unsigned int loadSize = wsize+width; 973 974 //group coordinate 975 const int groupX=get_local_size(0)*get_group_id(0); 976 const int groupY=get_local_size(1)*get_group_id(1); 977 //notice that get_local_size(0) is 1, so 978 //groupX=get_group_id(0); 979 980 //parallel load and clamp 981 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) 982 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; 983 984 // barrier 985 barrier(CLK_LOCAL_MEM_FENCE); 986 987 // only do the work if this is not a patched item 988 if (get_global_id(1) < rows) 989 { 990 // compute 991 float4 result = (float4) 0; 992 993 int i = 0; 994 995 for ( ; i+7 < width; ) 996 { 997 for (int j=0; j < 8; j++) 998 result+=filter[i+j]*temp[i+j+get_local_id(1)]; 999 i+=8; 1000 } 1001 1002 for ( ; i < width; i++) 1003 result+=filter[i]*temp[i+get_local_id(1)]; 1004 1005 // write back to global 1006 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 1007 } 1008 } 1009 ) 1010 1011 /* 1012 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1013 % % 1014 % % 1015 % % 1016 % C o n t r a s t % 1017 % % 1018 % % 1019 % % 1020 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1021 */ 1022 1023 STRINGIFY( 1024 1025 inline float4 ConvertRGBToHSB(const float4 pixel) 1026 { 1027 float4 result=0.0f; 1028 result.w=pixel.w; 1029 float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z); 1030 if (tmax != 0.0f) 1031 { 1032 float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z); 1033 float delta=tmax-tmin; 1034 1035 result.y=delta/tmax; 1036 result.z=QuantumScale*tmax; 1037 if (delta != 0.0f) 1038 { 1039 result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f)); 1040 result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ? 1041 (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta; 1042 result.x/=6.0f; 1043 result.x+=(result.x < 0.0f) ? 0.0f : 1.0f; 1044 } 1045 } 1046 return(result); 1047 } 1048 1049 inline float4 ConvertHSBToRGB(const float4 pixel) 1050 { 1051 float hue=pixel.x; 1052 float saturation=pixel.y; 1053 float brightness=pixel.z; 1054 1055 float4 result=pixel; 1056 1057 if (saturation == 0.0f) 1058 { 1059 result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness); 1060 } 1061 else 1062 { 1063 float h=6.0f*(hue-floor(hue)); 1064 float f=h-floor(h); 1065 float p=brightness*(1.0f-saturation); 1066 float q=brightness*(1.0f-saturation*f); 1067 float t=brightness*(1.0f-(saturation*(1.0f-f))); 1068 int ih = (int)h; 1069 1070 if (ih == 1) 1071 { 1072 result.x=ClampToQuantum(QuantumRange*q); 1073 result.y=ClampToQuantum(QuantumRange*brightness); 1074 result.z=ClampToQuantum(QuantumRange*p); 1075 } 1076 else if (ih == 2) 1077 { 1078 result.x=ClampToQuantum(QuantumRange*p); 1079 result.y=ClampToQuantum(QuantumRange*brightness); 1080 result.z=ClampToQuantum(QuantumRange*t); 1081 } 1082 else if (ih == 3) 1083 { 1084 result.x=ClampToQuantum(QuantumRange*p); 1085 result.y=ClampToQuantum(QuantumRange*q); 1086 result.z=ClampToQuantum(QuantumRange*brightness); 1087 } 1088 else if (ih == 4) 1089 { 1090 result.x=ClampToQuantum(QuantumRange*t); 1091 result.y=ClampToQuantum(QuantumRange*p); 1092 result.z=ClampToQuantum(QuantumRange*brightness); 1093 } 1094 else if (ih == 5) 1095 { 1096 result.x=ClampToQuantum(QuantumRange*brightness); 1097 result.y=ClampToQuantum(QuantumRange*p); 1098 result.z=ClampToQuantum(QuantumRange*q); 1099 } 1100 else 1101 { 1102 result.x=ClampToQuantum(QuantumRange*brightness); 1103 result.y=ClampToQuantum(QuantumRange*t); 1104 result.z=ClampToQuantum(QuantumRange*p); 1105 } 1106 } 1107 return(result); 1108 } 1109 1110 __kernel void Contrast(__global CLQuantum *image, 1111 const unsigned int number_channels,const int sign) 1112 { 1113 const int x=get_global_id(0); 1114 const int y=get_global_id(1); 1115 const unsigned int columns=get_global_size(0); 1116 1117 float4 pixel=ReadAllChannels(image,number_channels,columns,x,y); 1118 if (number_channels < 3) 1119 pixel.y=pixel.z=pixel.x; 1120 1121 pixel=ConvertRGBToHSB(pixel); 1122 float brightness=pixel.z; 1123 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); 1124 brightness=clamp(brightness,0.0f,1.0f); 1125 pixel.z=brightness; 1126 pixel=ConvertHSBToRGB(pixel); 1127 1128 WriteAllChannels(image,number_channels,columns,x,y,pixel); 1129 } 1130 ) 1131 1132 /* 1133 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1134 % % 1135 % % 1136 % % 1137 % C o n t r a s t S t r e t c h % 1138 % % 1139 % % 1140 % % 1141 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1142 */ 1143 1144 STRINGIFY( 1145 /* 1146 */ 1147 __kernel void Histogram(__global CLPixelType * restrict im, 1148 const ChannelType channel, 1149 const unsigned int colorspace, 1150 const unsigned int method, 1151 __global uint4 * restrict histogram) 1152 { 1153 const int x = get_global_id(0); 1154 const int y = get_global_id(1); 1155 const int columns = get_global_size(0); 1156 const int c = x + y * columns; 1157 if ((channel & SyncChannels) != 0) 1158 { 1159 float red=(float)getRed(im[c]); 1160 float green=(float)getGreen(im[c]); 1161 float blue=(float)getBlue(im[c]); 1162 1163 float intensity = GetPixelIntensity(colorspace, method, red, green, blue); 1164 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); 1165 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position 1166 } 1167 else 1168 { 1169 // for equalizing, we always need all channels? 1170 // otherwise something more 1171 } 1172 } 1173 ) 1174 1175 STRINGIFY( 1176 /* 1177 */ 1178 __kernel void ContrastStretch(__global CLPixelType * restrict im, 1179 const ChannelType channel, 1180 __global CLPixelType * restrict stretch_map, 1181 const float4 white, const float4 black) 1182 { 1183 const int x = get_global_id(0); 1184 const int y = get_global_id(1); 1185 const int columns = get_global_size(0); 1186 const int c = x + y * columns; 1187 1188 uint ePos; 1189 CLPixelType oValue, eValue; 1190 CLQuantum red, green, blue, alpha; 1191 1192 //read from global 1193 oValue=im[c]; 1194 1195 if ((channel & RedChannel) != 0) 1196 { 1197 if (getRedF4(white) != getRedF4(black)) 1198 { 1199 ePos = ScaleQuantumToMap(getRed(oValue)); 1200 eValue = stretch_map[ePos]; 1201 red = getRed(eValue); 1202 } 1203 } 1204 1205 if ((channel & GreenChannel) != 0) 1206 { 1207 if (getGreenF4(white) != getGreenF4(black)) 1208 { 1209 ePos = ScaleQuantumToMap(getGreen(oValue)); 1210 eValue = stretch_map[ePos]; 1211 green = getGreen(eValue); 1212 } 1213 } 1214 1215 if ((channel & BlueChannel) != 0) 1216 { 1217 if (getBlueF4(white) != getBlueF4(black)) 1218 { 1219 ePos = ScaleQuantumToMap(getBlue(oValue)); 1220 eValue = stretch_map[ePos]; 1221 blue = getBlue(eValue); 1222 } 1223 } 1224 1225 if ((channel & AlphaChannel) != 0) 1226 { 1227 if (getAlphaF4(white) != getAlphaF4(black)) 1228 { 1229 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1230 eValue = stretch_map[ePos]; 1231 alpha = getAlpha(eValue); 1232 } 1233 } 1234 1235 //write back 1236 im[c]=(CLPixelType)(blue, green, red, alpha); 1237 1238 } 1239 ) 1240 1241 /* 1242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1243 % % 1244 % % 1245 % % 1246 % C o n v o l v e % 1247 % % 1248 % % 1249 % % 1250 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1251 */ 1252 1253 STRINGIFY( 1254 __kernel 1255 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, 1256 const unsigned int imageWidth, const unsigned int imageHeight, 1257 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1258 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { 1259 1260 int2 blockID; 1261 blockID.x = get_global_id(0) / get_local_size(0); 1262 blockID.y = get_global_id(1) / get_local_size(1); 1263 1264 // image area processed by this workgroup 1265 int2 imageAreaOrg; 1266 imageAreaOrg.x = blockID.x * get_local_size(0); 1267 imageAreaOrg.y = blockID.y * get_local_size(1); 1268 1269 int2 midFilterDimen; 1270 midFilterDimen.x = (filterWidth-1)/2; 1271 midFilterDimen.y = (filterHeight-1)/2; 1272 1273 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; 1274 1275 // dimension of the local cache 1276 int2 cachedAreaDimen; 1277 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; 1278 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; 1279 1280 // cache the pixels accessed by this workgroup in local memory 1281 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); 1282 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; 1283 int groupSize = get_local_size(0) * get_local_size(1); 1284 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { 1285 1286 int2 cachedAreaIndex; 1287 cachedAreaIndex.x = i % cachedAreaDimen.x; 1288 cachedAreaIndex.y = i / cachedAreaDimen.x; 1289 1290 int2 imagePixelIndex; 1291 imagePixelIndex = cachedAreaOrg + cachedAreaIndex; 1292 1293 // only support EdgeVirtualPixelMethod through ClampToCanvas 1294 // TODO: implement other virtual pixel method 1295 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); 1296 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); 1297 1298 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; 1299 } 1300 1301 // cache the filter 1302 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { 1303 filterCache[i] = filter[i]; 1304 } 1305 barrier(CLK_LOCAL_MEM_FENCE); 1306 1307 1308 int2 imageIndex; 1309 imageIndex.x = imageAreaOrg.x + get_local_id(0); 1310 imageIndex.y = imageAreaOrg.y + get_local_id(1); 1311 1312 // if out-of-range, stops here and quit 1313 if (imageIndex.x >= imageWidth 1314 || imageIndex.y >= imageHeight) { 1315 return; 1316 } 1317 1318 int filterIndex = 0; 1319 float4 sum = (float4)0.0f; 1320 float gamma = 0.0f; 1321 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1322 int cacheIndexY = get_local_id(1); 1323 for (int j = 0; j < filterHeight; j++) { 1324 int cacheIndexX = get_local_id(0); 1325 for (int i = 0; i < filterWidth; i++) { 1326 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1327 float f = filterCache[filterIndex]; 1328 1329 sum.x += f * p.x; 1330 sum.y += f * p.y; 1331 sum.z += f * p.z; 1332 sum.w += f * p.w; 1333 1334 gamma += f; 1335 filterIndex++; 1336 cacheIndexX++; 1337 } 1338 cacheIndexY++; 1339 } 1340 } 1341 else { 1342 int cacheIndexY = get_local_id(1); 1343 for (int j = 0; j < filterHeight; j++) { 1344 int cacheIndexX = get_local_id(0); 1345 for (int i = 0; i < filterWidth; i++) { 1346 1347 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1348 float alpha = QuantumScale*p.w; 1349 float f = filterCache[filterIndex]; 1350 float g = alpha * f; 1351 1352 sum.x += g*p.x; 1353 sum.y += g*p.y; 1354 sum.z += g*p.z; 1355 sum.w += f*p.w; 1356 1357 gamma += g; 1358 filterIndex++; 1359 cacheIndexX++; 1360 } 1361 cacheIndexY++; 1362 } 1363 gamma = PerceptibleReciprocal(gamma); 1364 sum.xyz = gamma*sum.xyz; 1365 } 1366 CLPixelType outputPixel; 1367 outputPixel.x = ClampToQuantum(sum.x); 1368 outputPixel.y = ClampToQuantum(sum.y); 1369 outputPixel.z = ClampToQuantum(sum.z); 1370 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1371 1372 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1373 } 1374 ) 1375 1376 STRINGIFY( 1377 __kernel 1378 void Convolve(const __global CLPixelType *input, __global CLPixelType *output, 1379 const uint imageWidth, const uint imageHeight, 1380 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1381 const uint matte, const ChannelType channel) { 1382 1383 int2 imageIndex; 1384 imageIndex.x = get_global_id(0); 1385 imageIndex.y = get_global_id(1); 1386 1387 /* 1388 unsigned int imageWidth = get_global_size(0); 1389 unsigned int imageHeight = get_global_size(1); 1390 */ 1391 if (imageIndex.x >= imageWidth 1392 || imageIndex.y >= imageHeight) 1393 return; 1394 1395 int2 midFilterDimen; 1396 midFilterDimen.x = (filterWidth-1)/2; 1397 midFilterDimen.y = (filterHeight-1)/2; 1398 1399 int filterIndex = 0; 1400 float4 sum = (float4)0.0f; 1401 float gamma = 0.0f; 1402 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1403 for (int j = 0; j < filterHeight; j++) { 1404 int2 inputPixelIndex; 1405 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1406 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1407 for (int i = 0; i < filterWidth; i++) { 1408 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1409 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1410 1411 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1412 float f = filter[filterIndex]; 1413 1414 sum.x += f * p.x; 1415 sum.y += f * p.y; 1416 sum.z += f * p.z; 1417 sum.w += f * p.w; 1418 1419 gamma += f; 1420 1421 filterIndex++; 1422 } 1423 } 1424 } 1425 else { 1426 1427 for (int j = 0; j < filterHeight; j++) { 1428 int2 inputPixelIndex; 1429 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1430 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1431 for (int i = 0; i < filterWidth; i++) { 1432 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1433 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1434 1435 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1436 float alpha = QuantumScale*p.w; 1437 float f = filter[filterIndex]; 1438 float g = alpha * f; 1439 1440 sum.x += g*p.x; 1441 sum.y += g*p.y; 1442 sum.z += g*p.z; 1443 sum.w += f*p.w; 1444 1445 gamma += g; 1446 1447 1448 filterIndex++; 1449 } 1450 } 1451 gamma = PerceptibleReciprocal(gamma); 1452 sum.xyz = gamma*sum.xyz; 1453 } 1454 1455 CLPixelType outputPixel; 1456 outputPixel.x = ClampToQuantum(sum.x); 1457 outputPixel.y = ClampToQuantum(sum.y); 1458 outputPixel.z = ClampToQuantum(sum.z); 1459 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1460 1461 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1462 } 1463 ) 1464 1465 /* 1466 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1467 % % 1468 % % 1469 % % 1470 % D e s p e c k l e % 1471 % % 1472 % % 1473 % % 1474 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1475 */ 1476 1477 STRINGIFY( 1478 1479 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1480 , const unsigned int imageWidth, const unsigned int imageHeight 1481 , const int2 offset, const int polarity, const int matte) { 1482 1483 int x = get_global_id(0); 1484 int y = get_global_id(1); 1485 1486 CLPixelType v = inputImage[y*imageWidth+x]; 1487 1488 int2 neighbor; 1489 neighbor.y = y + offset.y; 1490 neighbor.x = x + offset.x; 1491 1492 int2 clampedNeighbor; 1493 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1494 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1495 1496 CLPixelType r = (clampedNeighbor.x == neighbor.x 1497 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1498 :(CLPixelType)0; 1499 1500 int sv[4]; 1501 sv[0] = (int)v.x; 1502 sv[1] = (int)v.y; 1503 sv[2] = (int)v.z; 1504 sv[3] = (int)v.w; 1505 1506 int sr[4]; 1507 sr[0] = (int)r.x; 1508 sr[1] = (int)r.y; 1509 sr[2] = (int)r.z; 1510 sr[3] = (int)r.w; 1511 1512 if (polarity > 0) { 1513 \n #pragma unroll 4\n 1514 for (unsigned int i = 0; i < 4; i++) { 1515 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; 1516 } 1517 } 1518 else { 1519 \n #pragma unroll 4\n 1520 for (unsigned int i = 0; i < 4; i++) { 1521 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; 1522 } 1523 1524 } 1525 1526 v.x = (CLQuantum)sv[0]; 1527 v.y = (CLQuantum)sv[1]; 1528 v.z = (CLQuantum)sv[2]; 1529 1530 if (matte!=0) 1531 v.w = (CLQuantum)sv[3]; 1532 1533 outputImage[y*imageWidth+x] = v; 1534 1535 } 1536 1537 1538 ) 1539 1540 STRINGIFY( 1541 1542 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1543 , const unsigned int imageWidth, const unsigned int imageHeight 1544 , const int2 offset, const int polarity, const int matte) { 1545 1546 int x = get_global_id(0); 1547 int y = get_global_id(1); 1548 1549 CLPixelType v = inputImage[y*imageWidth+x]; 1550 1551 int2 neighbor, clampedNeighbor; 1552 1553 neighbor.y = y + offset.y; 1554 neighbor.x = x + offset.x; 1555 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1556 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1557 1558 CLPixelType r = (clampedNeighbor.x == neighbor.x 1559 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1560 :(CLPixelType)0; 1561 1562 1563 neighbor.y = y - offset.y; 1564 neighbor.x = x - offset.x; 1565 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1566 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1567 1568 CLPixelType s = (clampedNeighbor.x == neighbor.x 1569 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1570 :(CLPixelType)0; 1571 1572 1573 int sv[4]; 1574 sv[0] = (int)v.x; 1575 sv[1] = (int)v.y; 1576 sv[2] = (int)v.z; 1577 sv[3] = (int)v.w; 1578 1579 int sr[4]; 1580 sr[0] = (int)r.x; 1581 sr[1] = (int)r.y; 1582 sr[2] = (int)r.z; 1583 sr[3] = (int)r.w; 1584 1585 int ss[4]; 1586 ss[0] = (int)s.x; 1587 ss[1] = (int)s.y; 1588 ss[2] = (int)s.z; 1589 ss[3] = (int)s.w; 1590 1591 if (polarity > 0) { 1592 \n #pragma unroll 4\n 1593 for (unsigned int i = 0; i < 4; i++) { 1594 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; 1595 // 1596 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1597 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1598 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1599 } 1600 } 1601 else { 1602 \n #pragma unroll 4\n 1603 for (unsigned int i = 0; i < 4; i++) { 1604 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; 1605 // 1606 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1607 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1608 } 1609 } 1610 1611 v.x = (CLQuantum)sv[0]; 1612 v.y = (CLQuantum)sv[1]; 1613 v.z = (CLQuantum)sv[2]; 1614 1615 if (matte!=0) 1616 v.w = (CLQuantum)sv[3]; 1617 1618 outputImage[y*imageWidth+x] = v; 1619 1620 } 1621 ) 1622 1623 /* 1624 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1625 % % 1626 % % 1627 % % 1628 % E q u a l i z e % 1629 % % 1630 % % 1631 % % 1632 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1633 */ 1634 1635 STRINGIFY( 1636 /* 1637 */ 1638 __kernel void Equalize(__global CLPixelType * restrict im, 1639 const ChannelType channel, 1640 __global CLPixelType * restrict equalize_map, 1641 const float4 white, const float4 black) 1642 { 1643 const int x = get_global_id(0); 1644 const int y = get_global_id(1); 1645 const int columns = get_global_size(0); 1646 const int c = x + y * columns; 1647 1648 uint ePos; 1649 CLPixelType oValue, eValue; 1650 CLQuantum red, green, blue, alpha; 1651 1652 //read from global 1653 oValue=im[c]; 1654 1655 if ((channel & SyncChannels) != 0) 1656 { 1657 if (getRedF4(white) != getRedF4(black)) 1658 { 1659 ePos = ScaleQuantumToMap(getRed(oValue)); 1660 eValue = equalize_map[ePos]; 1661 red = getRed(eValue); 1662 ePos = ScaleQuantumToMap(getGreen(oValue)); 1663 eValue = equalize_map[ePos]; 1664 green = getRed(eValue); 1665 ePos = ScaleQuantumToMap(getBlue(oValue)); 1666 eValue = equalize_map[ePos]; 1667 blue = getRed(eValue); 1668 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1669 eValue = equalize_map[ePos]; 1670 alpha = getRed(eValue); 1671 1672 //write back 1673 im[c]=(CLPixelType)(blue, green, red, alpha); 1674 } 1675 1676 } 1677 1678 // for equalizing, we always need all channels? 1679 // otherwise something more 1680 1681 } 1682 ) 1683 1684 /* 1685 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1686 % % 1687 % % 1688 % % 1689 % F u n c t i o n % 1690 % % 1691 % % 1692 % % 1693 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1694 */ 1695 1696 STRINGIFY( 1697 /* 1698 apply FunctionImageChannel(braightness-contrast) 1699 */ 1700 CLQuantum ApplyFunction(float pixel,const MagickFunction function, 1701 const unsigned int number_parameters,__constant float *parameters) 1702 { 1703 float result = 0.0f; 1704 1705 switch (function) 1706 { 1707 case PolynomialFunction: 1708 { 1709 for (unsigned int i=0; i < number_parameters; i++) 1710 result = result*QuantumScale*pixel + parameters[i]; 1711 result *= QuantumRange; 1712 break; 1713 } 1714 case SinusoidFunction: 1715 { 1716 float freq,phase,ampl,bias; 1717 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1718 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; 1719 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; 1720 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1721 result = QuantumRange*(ampl*sin(2.0f*MagickPI* 1722 (freq*QuantumScale*pixel + phase/360.0f)) + bias); 1723 break; 1724 } 1725 case ArcsinFunction: 1726 { 1727 float width,range,center,bias; 1728 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1729 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1730 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1731 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1732 1733 result = 2.0f/width*(QuantumScale*pixel - center); 1734 result = range/MagickPI*asin(result)+bias; 1735 result = ( result <= -1.0f ) ? bias - range/2.0f : result; 1736 result = ( result >= 1.0f ) ? bias + range/2.0f : result; 1737 result *= QuantumRange; 1738 break; 1739 } 1740 case ArctanFunction: 1741 { 1742 float slope,range,center,bias; 1743 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1744 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1745 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1746 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1747 result = MagickPI*slope*(QuantumScale*pixel-center); 1748 result = QuantumRange*(range/MagickPI*atan(result) + bias); 1749 break; 1750 } 1751 case UndefinedFunction: 1752 break; 1753 } 1754 return(ClampToQuantum(result)); 1755 } 1756 ) 1757 1758 STRINGIFY( 1759 /* 1760 Improve brightness / contrast of the image 1761 channel : define which channel is improved 1762 function : the function called to enchance the brightness contrast 1763 number_parameters : numbers of parameters 1764 parameters : the parameter 1765 */ 1766 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, 1767 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, 1768 __constant float *parameters) 1769 { 1770 const unsigned int x = get_global_id(0); 1771 const unsigned int y = get_global_id(1); 1772 const unsigned int columns = get_global_size(0); 1773 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1774 1775 float red; 1776 float green; 1777 float blue; 1778 float alpha; 1779 1780 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 1781 1782 if ((channel & RedChannel) != 0) 1783 red=ApplyFunction(red, function, number_parameters, parameters); 1784 1785 if (number_channels > 2) 1786 { 1787 if ((channel & GreenChannel) != 0) 1788 green=ApplyFunction(green, function, number_parameters, parameters); 1789 1790 if ((channel & BlueChannel) != 0) 1791 blue=ApplyFunction(blue, function, number_parameters, parameters); 1792 } 1793 1794 if (((number_channels == 4) || (number_channels == 2)) && 1795 ((channel & AlphaChannel) != 0)) 1796 alpha=ApplyFunction(alpha, function, number_parameters, parameters); 1797 1798 WriteChannels(p, number_channels, channel, red, green, blue, alpha); 1799 } 1800 ) 1801 1802 /* 1803 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1804 % % 1805 % % 1806 % % 1807 % G r a y s c a l e % 1808 % % 1809 % % 1810 % % 1811 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1812 */ 1813 1814 STRINGIFY( 1815 __kernel void Grayscale(__global CLQuantum *image,const int number_channels, 1816 const unsigned int colorspace,const unsigned int method) 1817 { 1818 const unsigned int x = get_global_id(0); 1819 const unsigned int y = get_global_id(1); 1820 const unsigned int columns = get_global_size(0); 1821 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1822 1823 float 1824 blue, 1825 green, 1826 red; 1827 1828 red=getPixelRed(p); 1829 green=getPixelGreen(p); 1830 blue=getPixelBlue(p); 1831 1832 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); 1833 1834 setPixelRed(p,intensity); 1835 setPixelGreen(p,intensity); 1836 setPixelBlue(p,intensity); 1837 } 1838 ) 1839 1840 /* 1841 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1842 % % 1843 % % 1844 % % 1845 % L o c a l C o n t r a s t % 1846 % % 1847 % % 1848 % % 1849 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1850 */ 1851 1852 STRINGIFY( 1853 1854 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, 1855 const int radius, 1856 const int imageWidth, 1857 const int imageHeight) 1858 { 1859 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); 1860 1861 int x = get_local_id(0); 1862 int y = get_global_id(1); 1863 1864 if ((x >= imageWidth) || (y >= imageHeight)) 1865 return; 1866 1867 global CLPixelType *src = srcImage + y * imageWidth; 1868 1869 for (int i = x; i < imageWidth; i += get_local_size(0)) { 1870 float sum = 0.0f; 1871 float weight = 1.0f; 1872 1873 int j = i - radius; 1874 while ((j + 7) < i) { 1875 for (int k = 0; k < 8; ++k) // Unroll 8x 1876 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); 1877 weight += 8.0f; 1878 j+=8; 1879 } 1880 while (j < i) { 1881 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); 1882 weight += 1.0f; 1883 ++j; 1884 } 1885 1886 while ((j + 7) < radius + i) { 1887 for (int k = 0; k < 8; ++k) // Unroll 8x 1888 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); 1889 weight -= 8.0f; 1890 j+=8; 1891 } 1892 while (j < radius + i) { 1893 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); 1894 weight -= 1.0f; 1895 ++j; 1896 } 1897 1898 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); 1899 } 1900 } 1901 ) 1902 1903 STRINGIFY( 1904 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, 1905 const int radius, 1906 const float strength, 1907 const int imageWidth, 1908 const int imageHeight) 1909 { 1910 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); 1911 1912 int x = get_global_id(0); 1913 int y = get_global_id(1); 1914 1915 if ((x >= imageWidth) || (y >= imageHeight)) 1916 return; 1917 1918 global float *src = blurImage + x; 1919 1920 float sum = 0.0f; 1921 float weight = 1.0f; 1922 1923 int j = y - radius; 1924 while ((j + 7) < y) { 1925 for (int k = 0; k < 8; ++k) // Unroll 8x 1926 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; 1927 weight += 8.0f; 1928 j+=8; 1929 } 1930 while (j < y) { 1931 sum += weight * src[mirrorBottom(j) * imageWidth]; 1932 weight += 1.0f; 1933 ++j; 1934 } 1935 1936 while ((j + 7) < radius + y) { 1937 for (int k = 0; k < 8; ++k) // Unroll 8x 1938 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; 1939 weight -= 8.0f; 1940 j+=8; 1941 } 1942 while (j < radius + y) { 1943 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; 1944 weight -= 1.0f; 1945 ++j; 1946 } 1947 1948 CLPixelType pixel = srcImage[x + y * imageWidth]; 1949 float srcVal = dot(RGB, convert_float4(pixel)); 1950 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); 1951 mult = (srcVal + mult) / srcVal; 1952 1953 pixel.x = ClampToQuantum(pixel.x * mult); 1954 pixel.y = ClampToQuantum(pixel.y * mult); 1955 pixel.z = ClampToQuantum(pixel.z * mult); 1956 1957 dstImage[x + y * imageWidth] = pixel; 1958 } 1959 ) 1960 1961 /* 1962 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1963 % % 1964 % % 1965 % % 1966 % M o d u l a t e % 1967 % % 1968 % % 1969 % % 1970 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1971 */ 1972 1973 STRINGIFY( 1974 1975 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, 1976 float *hue, float *saturation, float *lightness) 1977 { 1978 float 1979 c, 1980 tmax, 1981 tmin; 1982 1983 /* 1984 Convert RGB to HSL colorspace. 1985 */ 1986 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); 1987 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); 1988 1989 c=tmax-tmin; 1990 1991 *lightness=(tmax+tmin)/2.0; 1992 if (c <= 0.0) 1993 { 1994 *hue=0.0; 1995 *saturation=0.0; 1996 return; 1997 } 1998 1999 if (tmax == (QuantumScale*red)) 2000 { 2001 *hue=(QuantumScale*green-QuantumScale*blue)/c; 2002 if ((QuantumScale*green) < (QuantumScale*blue)) 2003 *hue+=6.0; 2004 } 2005 else 2006 if (tmax == (QuantumScale*green)) 2007 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; 2008 else 2009 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; 2010 2011 *hue*=60.0/360.0; 2012 if (*lightness <= 0.5) 2013 *saturation=c/(2.0*(*lightness)); 2014 else 2015 *saturation=c/(2.0-2.0*(*lightness)); 2016 } 2017 2018 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, 2019 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2020 { 2021 float 2022 b, 2023 c, 2024 g, 2025 h, 2026 tmin, 2027 r, 2028 x; 2029 2030 /* 2031 Convert HSL to RGB colorspace. 2032 */ 2033 h=hue*360.0; 2034 if (lightness <= 0.5) 2035 c=2.0*lightness*saturation; 2036 else 2037 c=(2.0-2.0*lightness)*saturation; 2038 tmin=lightness-0.5*c; 2039 h-=360.0*floor(h/360.0); 2040 h/=60.0; 2041 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); 2042 switch ((int) floor(h) % 6) 2043 { 2044 case 0: 2045 { 2046 r=tmin+c; 2047 g=tmin+x; 2048 b=tmin; 2049 break; 2050 } 2051 case 1: 2052 { 2053 r=tmin+x; 2054 g=tmin+c; 2055 b=tmin; 2056 break; 2057 } 2058 case 2: 2059 { 2060 r=tmin; 2061 g=tmin+c; 2062 b=tmin+x; 2063 break; 2064 } 2065 case 3: 2066 { 2067 r=tmin; 2068 g=tmin+x; 2069 b=tmin+c; 2070 break; 2071 } 2072 case 4: 2073 { 2074 r=tmin+x; 2075 g=tmin; 2076 b=tmin+c; 2077 break; 2078 } 2079 case 5: 2080 { 2081 r=tmin+c; 2082 g=tmin; 2083 b=tmin+x; 2084 break; 2085 } 2086 default: 2087 { 2088 r=0.0; 2089 g=0.0; 2090 b=0.0; 2091 } 2092 } 2093 *red=ClampToQuantum(QuantumRange*r); 2094 *green=ClampToQuantum(QuantumRange*g); 2095 *blue=ClampToQuantum(QuantumRange*b); 2096 } 2097 2098 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 2099 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2100 { 2101 float 2102 hue, 2103 lightness, 2104 saturation; 2105 2106 /* 2107 Increase or decrease color lightness, saturation, or hue. 2108 */ 2109 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); 2110 hue+=0.5*(0.01*percent_hue-1.0); 2111 while (hue < 0.0) 2112 hue+=1.0; 2113 while (hue >= 1.0) 2114 hue-=1.0; 2115 saturation*=0.01*percent_saturation; 2116 lightness*=0.01*percent_lightness; 2117 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); 2118 } 2119 2120 __kernel void Modulate(__global CLPixelType *im, 2121 const float percent_brightness, 2122 const float percent_hue, 2123 const float percent_saturation, 2124 const int colorspace) 2125 { 2126 2127 const int x = get_global_id(0); 2128 const int y = get_global_id(1); 2129 const int columns = get_global_size(0); 2130 const int c = x + y * columns; 2131 2132 CLPixelType pixel = im[c]; 2133 2134 CLQuantum 2135 blue, 2136 green, 2137 red; 2138 2139 red=getRed(pixel); 2140 green=getGreen(pixel); 2141 blue=getBlue(pixel); 2142 2143 switch (colorspace) 2144 { 2145 case HSLColorspace: 2146 default: 2147 { 2148 ModulateHSL(percent_hue, percent_saturation, percent_brightness, 2149 &red, &green, &blue); 2150 } 2151 2152 } 2153 2154 CLPixelType filteredPixel; 2155 2156 setRed(&filteredPixel, red); 2157 setGreen(&filteredPixel, green); 2158 setBlue(&filteredPixel, blue); 2159 filteredPixel.w = pixel.w; 2160 2161 im[c] = filteredPixel; 2162 } 2163 ) 2164 2165 /* 2166 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2167 % % 2168 % % 2169 % % 2170 % M o t i o n B l u r % 2171 % % 2172 % % 2173 % % 2174 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2175 */ 2176 2177 STRINGIFY( 2178 __kernel 2179 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, 2180 const unsigned int imageWidth, const unsigned int imageHeight, 2181 const __global float *filter, const unsigned int width, const __global int2* offset, 2182 const float4 bias, 2183 const ChannelType channel, const unsigned int matte) { 2184 2185 int2 currentPixel; 2186 currentPixel.x = get_global_id(0); 2187 currentPixel.y = get_global_id(1); 2188 2189 if (currentPixel.x >= imageWidth 2190 || currentPixel.y >= imageHeight) 2191 return; 2192 2193 float4 pixel; 2194 pixel.x = (float)bias.x; 2195 pixel.y = (float)bias.y; 2196 pixel.z = (float)bias.z; 2197 pixel.w = (float)bias.w; 2198 2199 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 2200 2201 for (int i = 0; i < width; i++) { 2202 // only support EdgeVirtualPixelMethod through ClampToCanvas 2203 // TODO: implement other virtual pixel method 2204 int2 samplePixel = currentPixel + offset[i]; 2205 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2206 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2207 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2208 2209 pixel.x += (filter[i] * (float)samplePixelValue.x); 2210 pixel.y += (filter[i] * (float)samplePixelValue.y); 2211 pixel.z += (filter[i] * (float)samplePixelValue.z); 2212 pixel.w += (filter[i] * (float)samplePixelValue.w); 2213 } 2214 2215 CLPixelType outputPixel; 2216 outputPixel.x = ClampToQuantum(pixel.x); 2217 outputPixel.y = ClampToQuantum(pixel.y); 2218 outputPixel.z = ClampToQuantum(pixel.z); 2219 outputPixel.w = ClampToQuantum(pixel.w); 2220 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2221 } 2222 else { 2223 2224 float gamma = 0.0f; 2225 for (int i = 0; i < width; i++) { 2226 // only support EdgeVirtualPixelMethod through ClampToCanvas 2227 // TODO: implement other virtual pixel method 2228 int2 samplePixel = currentPixel + offset[i]; 2229 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2230 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2231 2232 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2233 2234 float alpha = QuantumScale*samplePixelValue.w; 2235 float k = filter[i]; 2236 pixel.x = pixel.x + k * alpha * samplePixelValue.x; 2237 pixel.y = pixel.y + k * alpha * samplePixelValue.y; 2238 pixel.z = pixel.z + k * alpha * samplePixelValue.z; 2239 2240 pixel.w += k * alpha * samplePixelValue.w; 2241 2242 gamma+=k*alpha; 2243 } 2244 gamma = PerceptibleReciprocal(gamma); 2245 pixel.xyz = gamma*pixel.xyz; 2246 2247 CLPixelType outputPixel; 2248 outputPixel.x = ClampToQuantum(pixel.x); 2249 outputPixel.y = ClampToQuantum(pixel.y); 2250 outputPixel.z = ClampToQuantum(pixel.z); 2251 outputPixel.w = ClampToQuantum(pixel.w); 2252 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2253 } 2254 } 2255 ) 2256 2257 /* 2258 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2259 % % 2260 % % 2261 % % 2262 % R e s i z e % 2263 % % 2264 % % 2265 % % 2266 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2267 */ 2268 2269 STRINGIFY( 2270 // Based on Box from resize.c 2271 float BoxResizeFilter(const float x) 2272 { 2273 return 1.0f; 2274 } 2275 ) 2276 2277 STRINGIFY( 2278 // Based on CubicBC from resize.c 2279 float CubicBC(const float x,const __global float* resizeFilterCoefficients) 2280 { 2281 /* 2282 Cubic Filters using B,C determined values: 2283 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter 2284 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears 2285 Spline B = 1 C = 0 B-Spline Gaussian approximation 2286 Hermite B = 0 C = 0 B-Spline interpolator 2287 2288 See paper by Mitchell and Netravali, Reconstruction Filters in Computer 2289 Graphics Computer Graphics, Volume 22, Number 4, August 1988 2290 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ 2291 Mitchell.pdf. 2292 2293 Coefficents are determined from B,C values: 2294 P0 = ( 6 - 2*B )/6 = coeff[0] 2295 P1 = 0 2296 P2 = (-18 +12*B + 6*C )/6 = coeff[1] 2297 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] 2298 Q0 = ( 8*B +24*C )/6 = coeff[3] 2299 Q1 = ( -12*B -48*C )/6 = coeff[4] 2300 Q2 = ( 6*B +30*C )/6 = coeff[5] 2301 Q3 = ( - 1*B - 6*C )/6 = coeff[6] 2302 2303 which are used to define the filter: 2304 2305 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 2306 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 2307 2308 which ensures function is continuous in value and derivative (slope). 2309 */ 2310 if (x < 1.0) 2311 return(resizeFilterCoefficients[0]+x*(x* 2312 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); 2313 if (x < 2.0) 2314 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* 2315 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); 2316 return(0.0); 2317 } 2318 ) 2319 2320 STRINGIFY( 2321 float Sinc(const float x) 2322 { 2323 if (x != 0.0f) 2324 { 2325 const float alpha=(float) (MagickPI*x); 2326 return sinpi(x)/alpha; 2327 } 2328 return(1.0f); 2329 } 2330 ) 2331 2332 STRINGIFY( 2333 float Triangle(const float x) 2334 { 2335 /* 2336 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or 2337 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function 2338 for Sinc(). 2339 */ 2340 return ((x<1.0f)?(1.0f-x):0.0f); 2341 } 2342 ) 2343 2344 2345 STRINGIFY( 2346 float Hann(const float x) 2347 { 2348 /* 2349 Cosine window function: 2350 0.5+0.5*cos(pi*x). 2351 */ 2352 const float cosine=cos((MagickPI*x)); 2353 return(0.5f+0.5f*cosine); 2354 } 2355 ) 2356 2357 STRINGIFY( 2358 float Hamming(const float x) 2359 { 2360 /* 2361 Offset cosine window function: 2362 .54 + .46 cos(pi x). 2363 */ 2364 const float cosine=cos((MagickPI*x)); 2365 return(0.54f+0.46f*cosine); 2366 } 2367 ) 2368 2369 STRINGIFY( 2370 float Blackman(const float x) 2371 { 2372 /* 2373 Blackman: 2nd order cosine windowing function: 2374 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) 2375 2376 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and 2377 five flops. 2378 */ 2379 const float cosine=cos((MagickPI*x)); 2380 return(0.34f+cosine*(0.5f+cosine*0.16f)); 2381 } 2382 ) 2383 2384 STRINGIFY( 2385 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) 2386 { 2387 switch (filterType) 2388 { 2389 /* Call Sinc even for SincFast to get better precision on GPU 2390 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ 2391 case SincWeightingFunction: 2392 case SincFastWeightingFunction: 2393 return Sinc(x); 2394 case CubicBCWeightingFunction: 2395 return CubicBC(x,filterCoefficients); 2396 case BoxWeightingFunction: 2397 return BoxResizeFilter(x); 2398 case TriangleWeightingFunction: 2399 return Triangle(x); 2400 case HannWeightingFunction: 2401 return Hann(x); 2402 case HammingWeightingFunction: 2403 return Hamming(x); 2404 case BlackmanWeightingFunction: 2405 return Blackman(x); 2406 2407 default: 2408 return 0.0f; 2409 } 2410 } 2411 ) 2412 2413 2414 STRINGIFY( 2415 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType 2416 , const ResizeWeightingFunctionType resizeWindowType 2417 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) 2418 { 2419 float scale; 2420 float xBlur = fabs(x/resizeFilterBlur); 2421 if (resizeWindowSupport < MagickEpsilon 2422 || resizeWindowType == BoxWeightingFunction) 2423 { 2424 scale = 1.0f; 2425 } 2426 else 2427 { 2428 scale = resizeFilterScale; 2429 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); 2430 } 2431 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); 2432 return weight; 2433 } 2434 2435 ) 2436 2437 ; 2438 const char *accelerateKernels2 = 2439 2440 STRINGIFY( 2441 2442 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2443 return (numWorkItems/pixelPerWorkgroup); 2444 } 2445 2446 // returns the index of the pixel for the current workitem to compute. 2447 // returns -1 if this workitem doesn't need to participate in any computation 2448 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2449 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); 2450 int pixelIndex = itemID/numWorkItemsPerPixel; 2451 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; 2452 return pixelIndex; 2453 } 2454 2455 ) 2456 2457 STRINGIFY( 2458 __kernel __attribute__((reqd_work_group_size(256, 1, 1))) 2459 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2460 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2461 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, 2462 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2463 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2464 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2465 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2466 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2467 { 2468 // calculate the range of resized image pixels computed by this workgroup 2469 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; 2470 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); 2471 const unsigned int actualNumPixelToCompute = stopX - startX; 2472 2473 // calculate the range of input image pixels to cache 2474 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); 2475 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2476 scale = PerceptibleReciprocal(scale); 2477 2478 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); 2479 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); 2480 2481 // cache the input pixels into local memory 2482 const unsigned int y = get_global_id(1); 2483 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); 2484 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; 2485 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); 2486 wait_group_events(1, &e); 2487 2488 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2489 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2490 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2491 { 2492 const unsigned int chunkStartX = startX + chunk*pixelChunkSize; 2493 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); 2494 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; 2495 2496 // determine which resized pixel computed by this workitem 2497 const unsigned int itemID = get_local_id(0); 2498 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); 2499 2500 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); 2501 2502 float4 filteredPixel = (float4)0.0f; 2503 float density = 0.0f; 2504 float gamma = 0.0f; 2505 // -1 means this workitem doesn't participate in the computation 2506 if (pixelIndex != -1) 2507 { 2508 // x coordinated of the resized pixel computed by this workitem 2509 const int x = chunkStartX + pixelIndex; 2510 2511 // calculate how many steps required for this pixel 2512 const float bisect = (x+0.5)/xFactor+MagickEpsilon; 2513 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2514 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); 2515 const unsigned int n = stop - start; 2516 2517 // calculate how many steps this workitem will contribute 2518 unsigned int numStepsPerWorkItem = n / numItems; 2519 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2520 2521 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2522 if (startStep < n) 2523 { 2524 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2525 2526 unsigned int cacheIndex = start+startStep-cacheRangeStartX; 2527 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2528 { 2529 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2530 (ResizeWeightingFunctionType) resizeFilterType, 2531 (ResizeWeightingFunctionType) resizeWindowType, 2532 resizeFilterScale, resizeFilterWindowSupport, 2533 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2534 2535 float4 cp = (float4)0.0f; 2536 2537 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); 2538 cp.x = (float) *(p); 2539 if (number_channels > 2) 2540 { 2541 cp.y = (float) *(p + 1); 2542 cp.z = (float) *(p + 2); 2543 } 2544 2545 if (alpha_index != 0) 2546 { 2547 cp.w = (float) *(p + alpha_index); 2548 2549 float alpha = weight * QuantumScale * cp.w; 2550 2551 filteredPixel.x += alpha * cp.x; 2552 filteredPixel.y += alpha * cp.y; 2553 filteredPixel.z += alpha * cp.z; 2554 filteredPixel.w += weight * cp.w; 2555 gamma += alpha; 2556 } 2557 else 2558 filteredPixel += ((float4) weight)*cp; 2559 2560 density += weight; 2561 } 2562 } 2563 } 2564 2565 // initialize the accumulators to zero 2566 if (itemID < actualNumPixelInThisChunk) { 2567 outputPixelCache[itemID] = (float4)0.0f; 2568 densityCache[itemID] = 0.0f; 2569 if (alpha_index != 0) 2570 gammaCache[itemID] = 0.0f; 2571 } 2572 barrier(CLK_LOCAL_MEM_FENCE); 2573 2574 // accumulatte the filtered pixel value and the density 2575 for (unsigned int i = 0; i < numItems; i++) { 2576 if (pixelIndex != -1) { 2577 if (itemID%numItems == i) { 2578 outputPixelCache[pixelIndex]+=filteredPixel; 2579 densityCache[pixelIndex]+=density; 2580 if (alpha_index != 0) 2581 gammaCache[pixelIndex]+=gamma; 2582 } 2583 } 2584 barrier(CLK_LOCAL_MEM_FENCE); 2585 } 2586 2587 if (itemID < actualNumPixelInThisChunk) 2588 { 2589 float4 filteredPixel = outputPixelCache[itemID]; 2590 2591 float gamma = 0.0f; 2592 if (alpha_index != 0) 2593 gamma = gammaCache[itemID]; 2594 2595 float density = densityCache[itemID]; 2596 if ((density != 0.0f) && (density != 1.0f)) 2597 { 2598 density = PerceptibleReciprocal(density); 2599 filteredPixel *= (float4) density; 2600 if (alpha_index != 0) 2601 gamma *= density; 2602 } 2603 2604 if (alpha_index != 0) 2605 { 2606 gamma = PerceptibleReciprocal(gamma); 2607 filteredPixel.x *= gamma; 2608 filteredPixel.y *= gamma; 2609 filteredPixel.z *= gamma; 2610 } 2611 2612 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel); 2613 } 2614 } 2615 } 2616 ) 2617 2618 2619 STRINGIFY( 2620 __kernel __attribute__((reqd_work_group_size(1, 256, 1))) 2621 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2622 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2623 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, 2624 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2625 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2626 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2627 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2628 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2629 { 2630 // calculate the range of resized image pixels computed by this workgroup 2631 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; 2632 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); 2633 const unsigned int actualNumPixelToCompute = stopY - startY; 2634 2635 // calculate the range of input image pixels to cache 2636 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); 2637 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2638 scale = PerceptibleReciprocal(scale); 2639 2640 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); 2641 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); 2642 2643 // cache the input pixels into local memory 2644 const unsigned int x = get_global_id(0); 2645 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); 2646 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; 2647 unsigned int stride = inputColumns * number_channels; 2648 for (unsigned int i = 0; i < number_channels; i++) 2649 { 2650 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); 2651 wait_group_events(1,&e); 2652 } 2653 2654 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2655 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2656 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2657 { 2658 const unsigned int chunkStartY = startY + chunk*pixelChunkSize; 2659 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); 2660 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; 2661 2662 // determine which resized pixel computed by this workitem 2663 const unsigned int itemID = get_local_id(1); 2664 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); 2665 2666 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); 2667 2668 float4 filteredPixel = (float4)0.0f; 2669 float density = 0.0f; 2670 float gamma = 0.0f; 2671 // -1 means this workitem doesn't participate in the computation 2672 if (pixelIndex != -1) 2673 { 2674 // x coordinated of the resized pixel computed by this workitem 2675 const int y = chunkStartY + pixelIndex; 2676 2677 // calculate how many steps required for this pixel 2678 const float bisect = (y+0.5)/yFactor+MagickEpsilon; 2679 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2680 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); 2681 const unsigned int n = stop - start; 2682 2683 // calculate how many steps this workitem will contribute 2684 unsigned int numStepsPerWorkItem = n / numItems; 2685 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2686 2687 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2688 if (startStep < n) 2689 { 2690 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2691 2692 unsigned int cacheIndex = start+startStep-cacheRangeStartY; 2693 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2694 { 2695 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2696 (ResizeWeightingFunctionType) resizeFilterType, 2697 (ResizeWeightingFunctionType) resizeWindowType, 2698 resizeFilterScale, resizeFilterWindowSupport, 2699 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2700 2701 float4 cp = (float4)0.0f; 2702 2703 __local CLQuantum *p = inputImageCache + cacheIndex; 2704 cp.x = (float) *(p); 2705 if (number_channels > 2) 2706 { 2707 cp.y = (float) *(p + rangeLength); 2708 cp.z = (float) *(p + (rangeLength * 2)); 2709 } 2710 2711 if (alpha_index != 0) 2712 { 2713 cp.w = (float) *(p + (rangeLength * alpha_index)); 2714 2715 float alpha = weight * QuantumScale * cp.w; 2716 2717 filteredPixel.x += alpha * cp.x; 2718 filteredPixel.y += alpha * cp.y; 2719 filteredPixel.z += alpha * cp.z; 2720 filteredPixel.w += weight * cp.w; 2721 gamma += alpha; 2722 } 2723 else 2724 filteredPixel += ((float4) weight)*cp; 2725 2726 density += weight; 2727 } 2728 } 2729 } 2730 2731 // initialize the accumulators to zero 2732 if (itemID < actualNumPixelInThisChunk) { 2733 outputPixelCache[itemID] = (float4)0.0f; 2734 densityCache[itemID] = 0.0f; 2735 if (alpha_index != 0) 2736 gammaCache[itemID] = 0.0f; 2737 } 2738 barrier(CLK_LOCAL_MEM_FENCE); 2739 2740 // accumulatte the filtered pixel value and the density 2741 for (unsigned int i = 0; i < numItems; i++) { 2742 if (pixelIndex != -1) { 2743 if (itemID%numItems == i) { 2744 outputPixelCache[pixelIndex]+=filteredPixel; 2745 densityCache[pixelIndex]+=density; 2746 if (alpha_index != 0) 2747 gammaCache[pixelIndex]+=gamma; 2748 } 2749 } 2750 barrier(CLK_LOCAL_MEM_FENCE); 2751 } 2752 2753 if (itemID < actualNumPixelInThisChunk) 2754 { 2755 float4 filteredPixel = outputPixelCache[itemID]; 2756 2757 float gamma = 0.0f; 2758 if (alpha_index != 0) 2759 gamma = gammaCache[itemID]; 2760 2761 float density = densityCache[itemID]; 2762 if ((density != 0.0f) && (density != 1.0f)) 2763 { 2764 density = PerceptibleReciprocal(density); 2765 filteredPixel *= (float4) density; 2766 if (alpha_index != 0) 2767 gamma *= density; 2768 } 2769 2770 if (alpha_index != 0) 2771 { 2772 gamma = PerceptibleReciprocal(gamma); 2773 filteredPixel.x *= gamma; 2774 filteredPixel.y *= gamma; 2775 filteredPixel.z *= gamma; 2776 } 2777 2778 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel); 2779 } 2780 } 2781 } 2782 ) 2783 2784 /* 2785 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2786 % % 2787 % % 2788 % % 2789 % R o t a t i o n a l B l u r % 2790 % % 2791 % % 2792 % % 2793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2794 */ 2795 2796 STRINGIFY( 2797 __kernel void RotationalBlur(const __global CLQuantum *image, 2798 const unsigned int number_channels,const unsigned int channel, 2799 const float2 blurCenter,__constant float *cos_theta, 2800 __constant float *sin_theta,const unsigned int cossin_theta_size, 2801 __global CLQuantum *filteredImage) 2802 { 2803 const int x = get_global_id(0); 2804 const int y = get_global_id(1); 2805 const int columns = get_global_size(0); 2806 const int rows = get_global_size(1); 2807 unsigned int step = 1; 2808 float center_x = (float) x - blurCenter.x; 2809 float center_y = (float) y - blurCenter.y; 2810 float radius = hypot(center_x, center_y); 2811 2812 float blur_radius = hypot(blurCenter.x, blurCenter.y); 2813 2814 if (radius > MagickEpsilon) 2815 { 2816 step = (unsigned int) (blur_radius / radius); 2817 if (step == 0) 2818 step = 1; 2819 if (step >= cossin_theta_size) 2820 step = cossin_theta_size-1; 2821 } 2822 2823 float4 result = 0.0f; 2824 float normalize = 0.0f; 2825 float gamma = 0.0f; 2826 2827 for (unsigned int i=0; i<cossin_theta_size; i+=step) 2828 { 2829 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); 2830 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); 2831 2832 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy); 2833 2834 if ((number_channels == 4) || (number_channels == 2)) 2835 { 2836 float alpha = (float)(QuantumScale*pixel.w); 2837 2838 gamma += alpha; 2839 2840 result.x += alpha * pixel.x; 2841 result.y += alpha * pixel.y; 2842 result.z += alpha * pixel.z; 2843 result.w += pixel.w; 2844 } 2845 else 2846 result += pixel; 2847 2848 normalize += 1.0f; 2849 } 2850 2851 normalize = PerceptibleReciprocal(normalize); 2852 2853 if ((number_channels == 4) || (number_channels == 2)) 2854 { 2855 gamma = PerceptibleReciprocal(gamma); 2856 result.x *= gamma; 2857 result.y *= gamma; 2858 result.z *= gamma; 2859 result.w *= normalize; 2860 } 2861 else 2862 result *= normalize; 2863 2864 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 2865 } 2866 ) 2867 2868 /* 2869 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2870 % % 2871 % % 2872 % % 2873 % U n s h a r p M a s k % 2874 % % 2875 % % 2876 % % 2877 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2878 */ 2879 2880 STRINGIFY( 2881 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, 2882 const __global float4 *blurRowData,const unsigned int number_channels, 2883 const ChannelType channel,const unsigned int columns, 2884 const unsigned int rows,__local float4* cachedData, 2885 __local float* cachedFilter,const __global float *filter, 2886 const unsigned int width,const float gain, const float threshold, 2887 __global CLQuantum *filteredImage) 2888 { 2889 const unsigned int radius = (width-1)/2; 2890 2891 // cache the pixel shared by the workgroup 2892 const int groupX = get_group_id(0); 2893 const int groupStartY = get_group_id(1)*get_local_size(1) - radius; 2894 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; 2895 2896 if ((groupStartY >= 0) && (groupStopY < rows)) 2897 { 2898 event_t e = async_work_group_strided_copy(cachedData, 2899 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); 2900 wait_group_events(1,&e); 2901 } 2902 else 2903 { 2904 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) 2905 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; 2906 2907 barrier(CLK_LOCAL_MEM_FENCE); 2908 } 2909 // cache the filter as well 2910 event_t e = async_work_group_copy(cachedFilter,filter,width,0); 2911 wait_group_events(1,&e); 2912 2913 // only do the work if this is not a patched item 2914 const int cy = get_global_id(1); 2915 2916 if (cy < rows) 2917 { 2918 float4 blurredPixel = (float4) 0.0f; 2919 2920 int i = 0; 2921 2922 for ( ; i+7 < width; ) 2923 { 2924 for (int j=0; j < 8; j++, i++) 2925 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; 2926 } 2927 2928 for ( ; i < width; i++) 2929 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; 2930 2931 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); 2932 float4 outputPixel = inputImagePixel - blurredPixel; 2933 2934 float quantumThreshold = QuantumRange*threshold; 2935 2936 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); 2937 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); 2938 2939 //write back 2940 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); 2941 } 2942 } 2943 ) 2944 2945 STRINGIFY( 2946 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, 2947 const ChannelType channel,__constant float *filter,const unsigned int width, 2948 const unsigned int columns,const unsigned int rows,__local float4 *pixels, 2949 const float gain,const float threshold,__global CLQuantum *filteredImage) 2950 { 2951 const unsigned int x = get_global_id(0); 2952 const unsigned int y = get_global_id(1); 2953 2954 const unsigned int radius = (width - 1) / 2; 2955 2956 int row = y - radius; 2957 int baseRow = get_group_id(1) * get_local_size(1) - radius; 2958 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; 2959 2960 while (row < endRow) { 2961 int srcy = (row < 0) ? -row : row; // mirror pad 2962 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; 2963 2964 float4 value = 0.0f; 2965 2966 int ix = x - radius; 2967 int i = 0; 2968 2969 while (i + 7 < width) { 2970 for (int j = 0; j < 8; ++j) { // unrolled 2971 int srcx = ix + j; 2972 srcx = (srcx < 0) ? -srcx : srcx; 2973 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2974 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2975 } 2976 ix += 8; 2977 i += 8; 2978 } 2979 2980 while (i < width) { 2981 int srcx = (ix < 0) ? -ix : ix; // mirror pad 2982 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2983 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2984 ++i; 2985 ++ix; 2986 } 2987 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; 2988 row += get_local_size(1); 2989 } 2990 2991 barrier(CLK_LOCAL_MEM_FENCE); 2992 2993 const int px = get_local_id(0); 2994 const int py = get_local_id(1); 2995 const int prp = get_local_size(0); 2996 float4 value = (float4)(0.0f); 2997 2998 int i = 0; 2999 while (i + 7 < width) { 3000 for (int j = 0; j < 8; ++j) // unrolled 3001 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; 3002 i += 8; 3003 } 3004 while (i < width) { 3005 value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; 3006 ++i; 3007 } 3008 3009 if ((x < columns) && (y < rows)) { 3010 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); 3011 float4 diff = srcPixel - value; 3012 3013 float quantumThreshold = QuantumRange*threshold; 3014 3015 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); 3016 value = select(srcPixel + diff * gain, srcPixel, mask); 3017 3018 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); 3019 } 3020 } 3021 ) 3022 3023 STRINGIFY( 3024 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) 3025 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, 3026 const unsigned int number_channels,const unsigned int max_channels, 3027 const float threshold,const int passes,const unsigned int imageWidth, 3028 const unsigned int imageHeight) 3029 { 3030 const int pad = (1 << (passes - 1)); 3031 const int tileSize = 64; 3032 const int tileRowPixels = 64; 3033 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; 3034 3035 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) 3036 3037 local float buffer[64 * 64]; 3038 3039 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0); 3040 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad; 3041 3042 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3043 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + 3044 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; 3045 3046 for (int channel = 0; channel < max_channels; ++channel) 3047 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; 3048 } 3049 3050 for (int channel = 0; channel < max_channels; ++channel) { 3051 // Load LDS 3052 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3053 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); 3054 3055 // Process 3056 3057 float tmp[16]; 3058 float accum[16]; 3059 float pixel; 3060 3061 for (int i = 0; i < 16; i++) 3062 accum[i]=0.0f; 3063 3064 for (int pass = 0; pass < passes; ++pass) { 3065 const int radius = 1 << pass; 3066 const int x = get_local_id(0); 3067 const float thresh = threshold * noise[pass]; 3068 3069 // Apply horizontal hat 3070 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3071 const int offset = i * tileRowPixels; 3072 if (pass == 0) 3073 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass 3074 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); 3075 barrier(CLK_LOCAL_MEM_FENCE); 3076 buffer[x + offset] = pixel; 3077 } 3078 barrier(CLK_LOCAL_MEM_FENCE); 3079 3080 // Apply vertical hat 3081 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3082 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); 3083 float delta = tmp[i / 4] - pixel; 3084 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done 3085 if (delta < -thresh) 3086 delta += thresh; 3087 else if (delta > thresh) 3088 delta -= thresh; 3089 else 3090 delta = 0; 3091 accum[i / 4] += delta; 3092 } 3093 barrier(CLK_LOCAL_MEM_FENCE); 3094 3095 if (pass < passes - 1) 3096 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3097 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass 3098 else // last pass 3099 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3100 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output 3101 barrier(CLK_LOCAL_MEM_FENCE); 3102 } 3103 3104 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3105 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); 3106 3107 barrier(CLK_LOCAL_MEM_FENCE); 3108 } 3109 3110 // Write from stage to output 3111 3112 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { 3113 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3114 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { 3115 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); 3116 for (int channel = 0; channel < max_channels; ++channel) { 3117 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; 3118 } 3119 } 3120 } 3121 } 3122 } 3123 ) 3124 3125 ; 3126 3127 #endif // MAGICKCORE_OPENCL_SUPPORT 3128 3129 #if defined(__cplusplus) || defined(c_plusplus) 3130 } 3131 #endif 3132 3133 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 3134