1 /* 2 Copyright 1999-2021 ImageMagick Studio LLC, a non-profit organization 3 dedicated to making software imaging solutions freely available. 4 5 You may not use this file except in compliance with the License. You may 6 obtain a copy of the License at 7 8 https://imagemagick.org/script/license.php 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 16 MagickCore private kernels for accelerated functions. 17 */ 18 19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 21 22 #if defined(__cplusplus) || defined(c_plusplus) 23 extern "C" { 24 #endif 25 26 #if defined(MAGICKCORE_OPENCL_SUPPORT) 27 28 /* 29 Define declarations. 30 */ 31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" 32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" 33 #define OPENCL_ELSE() "\n #""else " " \n" 34 #define OPENCL_ENDIF() "\n #""endif " " \n" 35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" 36 #define STRINGIFY(...) #__VA_ARGS__ "\n" 37 38 const char *accelerateKernels = 39 40 /* 41 Define declarations. 42 */ 43 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) 44 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) 45 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) 46 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) 47 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) 48 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) 49 OPENCL_DEFINE(SigmaRandom, (attenuate)) 50 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) 51 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) 52 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) 53 54 /* 55 Typedef declarations. 56 */ 57 STRINGIFY( 58 typedef enum 59 { 60 UndefinedColorspace, 61 CMYColorspace, /* negated linear RGB colorspace */ 62 CMYKColorspace, /* CMY with Black separation */ 63 GRAYColorspace, /* Single Channel greyscale (non-linear) image */ 64 HCLColorspace, 65 HCLpColorspace, 66 HSBColorspace, 67 HSIColorspace, 68 HSLColorspace, 69 HSVColorspace, /* alias for HSB */ 70 HWBColorspace, 71 LabColorspace, 72 LCHColorspace, /* alias for LCHuv */ 73 LCHabColorspace, /* Cylindrical (Polar) Lab */ 74 LCHuvColorspace, /* Cylindrical (Polar) Luv */ 75 LogColorspace, 76 LMSColorspace, 77 LuvColorspace, 78 OHTAColorspace, 79 Rec601YCbCrColorspace, 80 Rec709YCbCrColorspace, 81 RGBColorspace, /* Linear RGB colorspace */ 82 scRGBColorspace, /* ??? */ 83 sRGBColorspace, /* Default: non-linear sRGB colorspace */ 84 TransparentColorspace, 85 xyYColorspace, 86 XYZColorspace, /* IEEE Color Reference colorspace */ 87 YCbCrColorspace, 88 YCCColorspace, 89 YDbDrColorspace, 90 YIQColorspace, 91 YPbPrColorspace, 92 YUVColorspace, 93 LinearGRAYColorspace /* Single Channel greyscale (linear) image */ 94 } ColorspaceType; 95 ) 96 97 STRINGIFY( 98 typedef enum 99 { 100 UndefinedCompositeOp, 101 AlphaCompositeOp, 102 AtopCompositeOp, 103 BlendCompositeOp, 104 BlurCompositeOp, 105 BumpmapCompositeOp, 106 ChangeMaskCompositeOp, 107 ClearCompositeOp, 108 ColorBurnCompositeOp, 109 ColorDodgeCompositeOp, 110 ColorizeCompositeOp, 111 CopyBlackCompositeOp, 112 CopyBlueCompositeOp, 113 CopyCompositeOp, 114 CopyCyanCompositeOp, 115 CopyGreenCompositeOp, 116 CopyMagentaCompositeOp, 117 CopyAlphaCompositeOp, 118 CopyRedCompositeOp, 119 CopyYellowCompositeOp, 120 DarkenCompositeOp, 121 DarkenIntensityCompositeOp, 122 DifferenceCompositeOp, 123 DisplaceCompositeOp, 124 DissolveCompositeOp, 125 DistortCompositeOp, 126 DivideDstCompositeOp, 127 DivideSrcCompositeOp, 128 DstAtopCompositeOp, 129 DstCompositeOp, 130 DstInCompositeOp, 131 DstOutCompositeOp, 132 DstOverCompositeOp, 133 ExclusionCompositeOp, 134 HardLightCompositeOp, 135 HardMixCompositeOp, 136 HueCompositeOp, 137 InCompositeOp, 138 IntensityCompositeOp, 139 LightenCompositeOp, 140 LightenIntensityCompositeOp, 141 LinearBurnCompositeOp, 142 LinearDodgeCompositeOp, 143 LinearLightCompositeOp, 144 LuminizeCompositeOp, 145 MathematicsCompositeOp, 146 MinusDstCompositeOp, 147 MinusSrcCompositeOp, 148 ModulateCompositeOp, 149 ModulusAddCompositeOp, 150 ModulusSubtractCompositeOp, 151 MultiplyCompositeOp, 152 NoCompositeOp, 153 OutCompositeOp, 154 OverCompositeOp, 155 OverlayCompositeOp, 156 PegtopLightCompositeOp, 157 PinLightCompositeOp, 158 PlusCompositeOp, 159 ReplaceCompositeOp, 160 SaturateCompositeOp, 161 ScreenCompositeOp, 162 SoftLightCompositeOp, 163 SrcAtopCompositeOp, 164 SrcCompositeOp, 165 SrcInCompositeOp, 166 SrcOutCompositeOp, 167 SrcOverCompositeOp, 168 ThresholdCompositeOp, 169 VividLightCompositeOp, 170 XorCompositeOp, 171 StereoCompositeOp 172 } CompositeOperator; 173 ) 174 175 STRINGIFY( 176 typedef enum 177 { 178 UndefinedFunction, 179 ArcsinFunction, 180 ArctanFunction, 181 PolynomialFunction, 182 SinusoidFunction 183 } MagickFunction; 184 ) 185 186 STRINGIFY( 187 typedef enum 188 { 189 UndefinedNoise, 190 UniformNoise, 191 GaussianNoise, 192 MultiplicativeGaussianNoise, 193 ImpulseNoise, 194 LaplacianNoise, 195 PoissonNoise, 196 RandomNoise 197 } NoiseType; 198 ) 199 200 STRINGIFY( 201 typedef enum 202 { 203 UndefinedPixelIntensityMethod = 0, 204 AveragePixelIntensityMethod, 205 BrightnessPixelIntensityMethod, 206 LightnessPixelIntensityMethod, 207 MSPixelIntensityMethod, 208 Rec601LumaPixelIntensityMethod, 209 Rec601LuminancePixelIntensityMethod, 210 Rec709LumaPixelIntensityMethod, 211 Rec709LuminancePixelIntensityMethod, 212 RMSPixelIntensityMethod 213 } PixelIntensityMethod; 214 ) 215 216 STRINGIFY( 217 typedef enum 218 { 219 BoxWeightingFunction = 0, 220 TriangleWeightingFunction, 221 CubicBCWeightingFunction, 222 HannWeightingFunction, 223 HammingWeightingFunction, 224 BlackmanWeightingFunction, 225 GaussianWeightingFunction, 226 QuadraticWeightingFunction, 227 JincWeightingFunction, 228 SincWeightingFunction, 229 SincFastWeightingFunction, 230 KaiserWeightingFunction, 231 WelchWeightingFunction, 232 BohmanWeightingFunction, 233 LagrangeWeightingFunction, 234 CosineWeightingFunction 235 } ResizeWeightingFunctionType; 236 ) 237 238 STRINGIFY( 239 typedef enum 240 { 241 UndefinedChannel = 0x0000, 242 RedChannel = 0x0001, 243 GrayChannel = 0x0001, 244 CyanChannel = 0x0001, 245 GreenChannel = 0x0002, 246 MagentaChannel = 0x0002, 247 BlueChannel = 0x0004, 248 YellowChannel = 0x0004, 249 BlackChannel = 0x0008, 250 AlphaChannel = 0x0010, 251 OpacityChannel = 0x0010, 252 IndexChannel = 0x0020, /* Color Index Table? */ 253 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ 254 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ 255 MetaChannel = 0x0100, /* ???? */ 256 CompositeChannels = 0x001F, 257 AllChannels = 0x7ffffff, 258 /* 259 Special purpose channel types. 260 FUTURE: are these needed any more - they are more like hacks 261 SyncChannels for example is NOT a real channel but a 'flag' 262 It really says -- "User has not defined channels" 263 Though it does have extra meaning in the "-auto-level" operator 264 */ 265 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ 266 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ 267 GrayChannels = 0x0400, 268 SyncChannels = 0x20000, /* channels modified as a single unit */ 269 DefaultChannels = AllChannels 270 } ChannelType; /* must correspond to PixelChannel */ 271 ) 272 273 /* 274 Helper functions. 275 */ 276 277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) 278 279 STRINGIFY( 280 static inline CLQuantum ScaleCharToQuantum(const unsigned char value) 281 { 282 return((CLQuantum) value); 283 } 284 ) 285 286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) 287 288 STRINGIFY( 289 static inline CLQuantum ScaleCharToQuantum(const unsigned char value) 290 { 291 return((CLQuantum) (257.0f*value)); 292 } 293 ) 294 295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) 296 297 STRINGIFY( 298 static inline CLQuantum ScaleCharToQuantum(const unsigned char value) 299 { 300 return((CLQuantum) (16843009.0*value)); 301 } 302 ) 303 304 OPENCL_ENDIF() 305 306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1)) 307 308 STRINGIFY( 309 static inline CLQuantum ClampToQuantum(const float value) 310 { 311 return (CLQuantum) clamp(value, 0.0f, QuantumRange); 312 } 313 ) 314 315 OPENCL_ELSE() 316 317 STRINGIFY( 318 static inline CLQuantum ClampToQuantum(const float value) 319 { 320 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); 321 } 322 ) 323 324 OPENCL_ENDIF() 325 326 STRINGIFY( 327 static inline int ClampToCanvas(const int offset,const int range) 328 { 329 return clamp(offset, (int)0, range-1); 330 } 331 ) 332 333 STRINGIFY( 334 static inline uint ScaleQuantumToMap(CLQuantum value) 335 { 336 if (value >= (CLQuantum) MaxMap) 337 return ((uint)MaxMap); 338 else 339 return ((uint)value); 340 } 341 ) 342 343 STRINGIFY( 344 static inline float PerceptibleReciprocal(const float x) 345 { 346 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; 347 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); 348 } 349 ) 350 351 STRINGIFY( 352 353 static inline unsigned int getPixelIndex(const unsigned int number_channels, 354 const unsigned int columns, const unsigned int x, const unsigned int y) 355 { 356 return (x * number_channels) + (y * columns * number_channels); 357 } 358 359 static inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } 360 static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } 361 static inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } 362 static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } 363 364 static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } 365 static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } 366 static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } 367 static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } 368 369 static inline CLQuantum getBlue(CLPixelType p) { return p.x; } 370 static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } 371 static inline float getBlueF4(float4 p) { return p.x; } 372 static inline void setBlueF4(float4* p, float value) { (*p).x = value; } 373 374 static inline CLQuantum getGreen(CLPixelType p) { return p.y; } 375 static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } 376 static inline float getGreenF4(float4 p) { return p.y; } 377 static inline void setGreenF4(float4* p, float value) { (*p).y = value; } 378 379 static inline CLQuantum getRed(CLPixelType p) { return p.z; } 380 static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } 381 static inline float getRedF4(float4 p) { return p.z; } 382 static inline void setRedF4(float4* p, float value) { (*p).z = value; } 383 384 static inline CLQuantum getAlpha(CLPixelType p) { return p.w; } 385 static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } 386 static inline float getAlphaF4(float4 p) { return p.w; } 387 static inline void setAlphaF4(float4* p, float value) { (*p).w = value; } 388 389 static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, 390 const ChannelType channel, float *red, float *green, float *blue, float *alpha) 391 { 392 if ((channel & RedChannel) != 0) 393 *red=getPixelRed(p); 394 395 if (number_channels > 2) 396 { 397 if ((channel & GreenChannel) != 0) 398 *green=getPixelGreen(p); 399 400 if ((channel & BlueChannel) != 0) 401 *blue=getPixelBlue(p); 402 } 403 404 if (((number_channels == 4) || (number_channels == 2)) && 405 ((channel & AlphaChannel) != 0)) 406 *alpha=getPixelAlpha(p,number_channels); 407 } 408 409 static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels, 410 const unsigned int columns, const unsigned int x, const unsigned int y) 411 { 412 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 413 414 float4 pixel; 415 416 pixel.x=getPixelRed(p); 417 418 if (number_channels > 2) 419 { 420 pixel.y=getPixelGreen(p); 421 pixel.z=getPixelBlue(p); 422 } 423 424 if ((number_channels == 4) || (number_channels == 2)) 425 pixel.w=getPixelAlpha(p,number_channels); 426 return(pixel); 427 } 428 429 static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, 430 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) 431 { 432 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 433 434 float red = 0.0f; 435 float green = 0.0f; 436 float blue = 0.0f; 437 float alpha = 0.0f; 438 439 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 440 return (float4)(red, green, blue, alpha); 441 } 442 443 static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, 444 const ChannelType channel, float red, float green, float blue, float alpha) 445 { 446 if ((channel & RedChannel) != 0) 447 setPixelRed(p,ClampToQuantum(red)); 448 449 if (number_channels > 2) 450 { 451 if ((channel & GreenChannel) != 0) 452 setPixelGreen(p,ClampToQuantum(green)); 453 454 if ((channel & BlueChannel) != 0) 455 setPixelBlue(p,ClampToQuantum(blue)); 456 } 457 458 if (((number_channels == 4) || (number_channels == 2)) && 459 ((channel & AlphaChannel) != 0)) 460 setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); 461 } 462 463 static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels, 464 const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel) 465 { 466 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 467 468 setPixelRed(p,ClampToQuantum(pixel.x)); 469 470 if (number_channels > 2) 471 { 472 setPixelGreen(p,ClampToQuantum(pixel.y)); 473 setPixelBlue(p,ClampToQuantum(pixel.z)); 474 } 475 476 if ((number_channels == 4) || (number_channels == 2)) 477 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w)); 478 } 479 480 static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, 481 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, 482 float4 pixel) 483 { 484 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 485 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); 486 } 487 488 static inline float GetPixelIntensity(const unsigned int colorspace, 489 const unsigned int method,float red,float green,float blue) 490 { 491 float intensity; 492 493 if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace)) 494 return red; 495 496 switch (method) 497 { 498 case AveragePixelIntensityMethod: 499 { 500 intensity=(red+green+blue)/3.0; 501 break; 502 } 503 case BrightnessPixelIntensityMethod: 504 { 505 intensity=MagickMax(MagickMax(red,green),blue); 506 break; 507 } 508 case LightnessPixelIntensityMethod: 509 { 510 intensity=(MagickMin(MagickMin(red,green),blue)+ 511 MagickMax(MagickMax(red,green),blue))/2.0; 512 break; 513 } 514 case MSPixelIntensityMethod: 515 { 516 intensity=(float) (((float) red*red+green*green+blue*blue)/ 517 (3.0*QuantumRange)); 518 break; 519 } 520 case Rec601LumaPixelIntensityMethod: 521 { 522 /* 523 if (image->colorspace == RGBColorspace) 524 { 525 red=EncodePixelGamma(red); 526 green=EncodePixelGamma(green); 527 blue=EncodePixelGamma(blue); 528 } 529 */ 530 intensity=0.298839*red+0.586811*green+0.114350*blue; 531 break; 532 } 533 case Rec601LuminancePixelIntensityMethod: 534 { 535 /* 536 if (image->colorspace == sRGBColorspace) 537 { 538 red=DecodePixelGamma(red); 539 green=DecodePixelGamma(green); 540 blue=DecodePixelGamma(blue); 541 } 542 */ 543 intensity=0.298839*red+0.586811*green+0.114350*blue; 544 break; 545 } 546 case Rec709LumaPixelIntensityMethod: 547 default: 548 { 549 /* 550 if (image->colorspace == RGBColorspace) 551 { 552 red=EncodePixelGamma(red); 553 green=EncodePixelGamma(green); 554 blue=EncodePixelGamma(blue); 555 } 556 */ 557 intensity=0.212656*red+0.715158*green+0.072186*blue; 558 break; 559 } 560 case Rec709LuminancePixelIntensityMethod: 561 { 562 /* 563 if (image->colorspace == sRGBColorspace) 564 { 565 red=DecodePixelGamma(red); 566 green=DecodePixelGamma(green); 567 blue=DecodePixelGamma(blue); 568 } 569 */ 570 intensity=0.212656*red+0.715158*green+0.072186*blue; 571 break; 572 } 573 case RMSPixelIntensityMethod: 574 { 575 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ 576 sqrt(3.0)); 577 break; 578 } 579 } 580 581 return intensity; 582 } 583 584 static inline int mirrorBottom(int value) 585 { 586 return (value < 0) ? - (value) : value; 587 } 588 589 static inline int mirrorTop(int value, int width) 590 { 591 return (value >= width) ? (2 * width - value - 1) : value; 592 } 593 ) 594 595 /* 596 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 597 % % 598 % % 599 % % 600 % A d d N o i s e % 601 % % 602 % % 603 % % 604 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 605 */ 606 607 STRINGIFY( 608 /* 609 Part of MWC64X by David Thomas, dt10@imperial.ac.uk 610 This is provided under BSD, full license is with the main package. 611 See http://www.doc.ic.ac.uk/~dt10/research 612 */ 613 614 // Pre: a<M, b<M 615 // Post: r=(a+b) mod M 616 ulong MWC_AddMod64(ulong a, ulong b, ulong M) 617 { 618 ulong v=a+b; 619 //if( (v>=M) || (v<a) ) 620 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. 621 v=v-M; 622 return v; 623 } 624 625 // Pre: a<M,b<M 626 // Post: r=(a*b) mod M 627 // This could be done more efficently, but it is portable, and should 628 // be easy to understand. It can be replaced with any of the better 629 // modular multiplication algorithms (for example if you know you have 630 // double precision available or something). 631 ulong MWC_MulMod64(ulong a, ulong b, ulong M) 632 { 633 ulong r=0; 634 while(a!=0){ 635 if(a&1) 636 r=MWC_AddMod64(r,b,M); 637 b=MWC_AddMod64(b,b,M); 638 a=a>>1; 639 } 640 return r; 641 } 642 643 // Pre: a<M, e>=0 644 // Post: r=(a^b) mod M 645 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on 646 // most architectures 647 ulong MWC_PowMod64(ulong a, ulong e, ulong M) 648 { 649 ulong sqr=a, acc=1; 650 while(e!=0){ 651 if(e&1) 652 acc=MWC_MulMod64(acc,sqr,M); 653 sqr=MWC_MulMod64(sqr,sqr,M); 654 e=e>>1; 655 } 656 return acc; 657 } 658 659 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) 660 { 661 ulong m=MWC_PowMod64(A, distance, M); 662 ulong x=curr.x*(ulong)A+curr.y; 663 x=MWC_MulMod64(x, m, M); 664 return (uint2)((uint)(x/A), (uint)(x%A)); 665 } 666 667 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) 668 { 669 // This is an arbitrary constant for starting LCG jumping from. I didn't 670 // want to start from 1, as then you end up with the two or three first values 671 // being a bit poor in ones - once you've decided that, one constant is as 672 // good as any another. There is no deep mathematical reason for it, I just 673 // generated a random number. 674 enum{ MWC_BASEID = 4077358422479273989UL }; 675 676 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; 677 ulong m=MWC_PowMod64(A, dist, M); 678 679 ulong x=MWC_MulMod64(MWC_BASEID, m, M); 680 return (uint2)((uint)(x/A), (uint)(x%A)); 681 } 682 683 //! Represents the state of a particular generator 684 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t; 685 686 void MWC64X_Step(mwc64x_state_t *s) 687 { 688 uint X=s->x, C=s->c; 689 690 uint Xn=s->seed0*X+C; 691 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar 692 uint Cn=mad_hi(s->seed0,X,carry); 693 694 s->x=Xn; 695 s->c=Cn; 696 } 697 698 void MWC64X_Skip(mwc64x_state_t *s, ulong distance) 699 { 700 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance); 701 s->x=tmp.x; 702 s->c=tmp.y; 703 } 704 705 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) 706 { 707 uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset); 708 s->x=tmp.x; 709 s->c=tmp.y; 710 } 711 712 //! Return a 32-bit integer in the range [0..2^32) 713 uint MWC64X_NextUint(mwc64x_state_t *s) 714 { 715 uint res=s->x ^ s->c; 716 MWC64X_Step(s); 717 return res; 718 } 719 720 // 721 // End of MWC64X excerpt 722 // 723 724 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) 725 { 726 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 727 } 728 729 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) 730 { 731 float 732 alpha, 733 beta, 734 noise, 735 sigma; 736 737 noise = 0.0f; 738 alpha=mwcReadPseudoRandomValue(r); 739 switch(noise_type) 740 { 741 case UniformNoise: 742 default: 743 { 744 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); 745 break; 746 } 747 case GaussianNoise: 748 { 749 float 750 gamma, 751 tau; 752 753 if (alpha == 0.0f) 754 alpha=1.0f; 755 beta=mwcReadPseudoRandomValue(r); 756 gamma=sqrt(-2.0f*log(alpha)); 757 sigma=gamma*cospi((2.0f*beta)); 758 tau=gamma*sinpi((2.0f*beta)); 759 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; 760 break; 761 } 762 case ImpulseNoise: 763 { 764 if (alpha < (SigmaImpulse/2.0f)) 765 noise=0.0f; 766 else 767 if (alpha >= (1.0f-(SigmaImpulse/2.0f))) 768 noise=QuantumRange; 769 else 770 noise=pixel; 771 break; 772 } 773 case LaplacianNoise: 774 { 775 if (alpha <= 0.5f) 776 { 777 if (alpha <= MagickEpsilon) 778 noise=(pixel-QuantumRange); 779 else 780 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ 781 0.5f); 782 break; 783 } 784 beta=1.0f-alpha; 785 if (beta <= (0.5f*MagickEpsilon)) 786 noise=(pixel+QuantumRange); 787 else 788 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); 789 break; 790 } 791 case MultiplicativeGaussianNoise: 792 { 793 sigma=1.0f; 794 if (alpha > MagickEpsilon) 795 sigma=sqrt(-2.0f*log(alpha)); 796 beta=mwcReadPseudoRandomValue(r); 797 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* 798 cospi((2.0f*beta))/2.0f); 799 break; 800 } 801 case PoissonNoise: 802 { 803 float 804 poisson; 805 unsigned int i; 806 poisson=exp(-SigmaPoisson*QuantumScale*pixel); 807 for (i=0; alpha > poisson; i++) 808 { 809 beta=mwcReadPseudoRandomValue(r); 810 alpha*=beta; 811 } 812 noise=(QuantumRange*i*PerceptibleReciprocal(SigmaPoisson)); 813 break; 814 } 815 case RandomNoise: 816 { 817 noise=(QuantumRange*SigmaRandom*alpha); 818 break; 819 } 820 } 821 return noise; 822 } 823 824 __kernel 825 void AddNoise(const __global CLQuantum *image, 826 const unsigned int number_channels,const ChannelType channel, 827 const unsigned int length,const unsigned int pixelsPerWorkItem, 828 const NoiseType noise_type,const float attenuate,const unsigned int seed0, 829 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, 830 __global CLQuantum *filteredImage) 831 { 832 mwc64x_state_t rng; 833 rng.seed0 = seed0; 834 rng.seed1 = seed1; 835 836 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use 837 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); 838 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams 839 840 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); 841 uint count = pixelsPerWorkItem; 842 843 while (count > 0 && pos < length) 844 { 845 const __global CLQuantum *p = image + pos; 846 __global CLQuantum *q = filteredImage + pos; 847 848 float red; 849 float green; 850 float blue; 851 float alpha; 852 853 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 854 855 if ((channel & RedChannel) != 0) 856 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); 857 858 if (number_channels > 2) 859 { 860 if ((channel & GreenChannel) != 0) 861 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); 862 863 if ((channel & BlueChannel) != 0) 864 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); 865 } 866 867 if (((number_channels == 4) || (number_channels == 2)) && 868 ((channel & AlphaChannel) != 0)) 869 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); 870 871 WriteChannels(q, number_channels, channel, red, green, blue, alpha); 872 873 pos += (get_local_size(0) * number_channels); 874 count--; 875 } 876 } 877 ) 878 879 /* 880 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 881 % % 882 % % 883 % % 884 % B l u r % 885 % % 886 % % 887 % % 888 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 889 */ 890 891 STRINGIFY( 892 /* 893 Reduce image noise and reduce detail levels by row 894 */ 895 __kernel void BlurRow(const __global CLQuantum *image, 896 const unsigned int number_channels,const ChannelType channel, 897 __constant float *filter,const unsigned int width, 898 const unsigned int imageColumns,const unsigned int imageRows, 899 __local float4 *temp,__global float4 *tempImage) 900 { 901 const int x = get_global_id(0); 902 const int y = get_global_id(1); 903 904 const int columns = imageColumns; 905 906 const unsigned int radius = (width-1)/2; 907 const int wsize = get_local_size(0); 908 const unsigned int loadSize = wsize+width; 909 910 //group coordinate 911 const int groupX=get_local_size(0)*get_group_id(0); 912 913 //parallel load and clamp 914 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) 915 { 916 int cx = ClampToCanvas(i + groupX - radius, columns); 917 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); 918 } 919 920 // barrier 921 barrier(CLK_LOCAL_MEM_FENCE); 922 923 // only do the work if this is not a patched item 924 if (get_global_id(0) < columns) 925 { 926 // compute 927 float4 result = (float4) 0; 928 929 int i = 0; 930 931 for ( ; i+7 < width; ) 932 { 933 for (int j=0; j < 8; j++) 934 result+=filter[i+j]*temp[i+j+get_local_id(0)]; 935 i+=8; 936 } 937 938 for ( ; i < width; i++) 939 result+=filter[i]*temp[i+get_local_id(0)]; 940 941 // write back to global 942 tempImage[y*columns+x] = result; 943 } 944 } 945 ) 946 947 STRINGIFY( 948 /* 949 Reduce image noise and reduce detail levels by line 950 */ 951 __kernel void BlurColumn(const __global float4 *blurRowData, 952 const unsigned int number_channels,const ChannelType channel, 953 __constant float *filter,const unsigned int width, 954 const unsigned int imageColumns,const unsigned int imageRows, 955 __local float4 *temp,__global CLQuantum *filteredImage) 956 { 957 const int x = get_global_id(0); 958 const int y = get_global_id(1); 959 960 const int columns = imageColumns; 961 const int rows = imageRows; 962 963 unsigned int radius = (width-1)/2; 964 const int wsize = get_local_size(1); 965 const unsigned int loadSize = wsize+width; 966 967 //group coordinate 968 const int groupX=get_local_size(0)*get_group_id(0); 969 const int groupY=get_local_size(1)*get_group_id(1); 970 //notice that get_local_size(0) is 1, so 971 //groupX=get_group_id(0); 972 973 //parallel load and clamp 974 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) 975 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; 976 977 // barrier 978 barrier(CLK_LOCAL_MEM_FENCE); 979 980 // only do the work if this is not a patched item 981 if (get_global_id(1) < rows) 982 { 983 // compute 984 float4 result = (float4) 0; 985 986 int i = 0; 987 988 for ( ; i+7 < width; ) 989 { 990 for (int j=0; j < 8; j++) 991 result+=filter[i+j]*temp[i+j+get_local_id(1)]; 992 i+=8; 993 } 994 995 for ( ; i < width; i++) 996 result+=filter[i]*temp[i+get_local_id(1)]; 997 998 // write back to global 999 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 1000 } 1001 } 1002 ) 1003 1004 /* 1005 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1006 % % 1007 % % 1008 % % 1009 % C o n t r a s t % 1010 % % 1011 % % 1012 % % 1013 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1014 */ 1015 1016 STRINGIFY( 1017 1018 static inline float4 ConvertRGBToHSB(const float4 pixel) 1019 { 1020 float4 result=0.0f; 1021 result.w=pixel.w; 1022 float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z); 1023 if (tmax != 0.0f) 1024 { 1025 float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z); 1026 float delta=tmax-tmin; 1027 1028 result.y=delta/tmax; 1029 result.z=QuantumScale*tmax; 1030 if (delta != 0.0f) 1031 { 1032 result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f)); 1033 result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ? 1034 (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta; 1035 result.x/=6.0f; 1036 result.x+=(result.x < 0.0f) ? 0.0f : 1.0f; 1037 } 1038 } 1039 return(result); 1040 } 1041 1042 static inline float4 ConvertHSBToRGB(const float4 pixel) 1043 { 1044 float hue=pixel.x; 1045 float saturation=pixel.y; 1046 float brightness=pixel.z; 1047 1048 float4 result=pixel; 1049 1050 if (saturation == 0.0f) 1051 { 1052 result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness); 1053 } 1054 else 1055 { 1056 float h=6.0f*(hue-floor(hue)); 1057 float f=h-floor(h); 1058 float p=brightness*(1.0f-saturation); 1059 float q=brightness*(1.0f-saturation*f); 1060 float t=brightness*(1.0f-(saturation*(1.0f-f))); 1061 int ih = (int)h; 1062 1063 if (ih == 1) 1064 { 1065 result.x=ClampToQuantum(QuantumRange*q); 1066 result.y=ClampToQuantum(QuantumRange*brightness); 1067 result.z=ClampToQuantum(QuantumRange*p); 1068 } 1069 else if (ih == 2) 1070 { 1071 result.x=ClampToQuantum(QuantumRange*p); 1072 result.y=ClampToQuantum(QuantumRange*brightness); 1073 result.z=ClampToQuantum(QuantumRange*t); 1074 } 1075 else if (ih == 3) 1076 { 1077 result.x=ClampToQuantum(QuantumRange*p); 1078 result.y=ClampToQuantum(QuantumRange*q); 1079 result.z=ClampToQuantum(QuantumRange*brightness); 1080 } 1081 else if (ih == 4) 1082 { 1083 result.x=ClampToQuantum(QuantumRange*t); 1084 result.y=ClampToQuantum(QuantumRange*p); 1085 result.z=ClampToQuantum(QuantumRange*brightness); 1086 } 1087 else if (ih == 5) 1088 { 1089 result.x=ClampToQuantum(QuantumRange*brightness); 1090 result.y=ClampToQuantum(QuantumRange*p); 1091 result.z=ClampToQuantum(QuantumRange*q); 1092 } 1093 else 1094 { 1095 result.x=ClampToQuantum(QuantumRange*brightness); 1096 result.y=ClampToQuantum(QuantumRange*t); 1097 result.z=ClampToQuantum(QuantumRange*p); 1098 } 1099 } 1100 return(result); 1101 } 1102 1103 __kernel void Contrast(__global CLQuantum *image, 1104 const unsigned int number_channels,const int sign) 1105 { 1106 const int x=get_global_id(0); 1107 const int y=get_global_id(1); 1108 const unsigned int columns=get_global_size(0); 1109 1110 float4 pixel=ReadAllChannels(image,number_channels,columns,x,y); 1111 if (number_channels < 3) 1112 pixel.y=pixel.z=pixel.x; 1113 1114 pixel=ConvertRGBToHSB(pixel); 1115 float brightness=pixel.z; 1116 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); 1117 brightness=clamp(brightness,0.0f,1.0f); 1118 pixel.z=brightness; 1119 pixel=ConvertHSBToRGB(pixel); 1120 1121 WriteAllChannels(image,number_channels,columns,x,y,pixel); 1122 } 1123 ) 1124 1125 /* 1126 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1127 % % 1128 % % 1129 % % 1130 % C o n t r a s t S t r e t c h % 1131 % % 1132 % % 1133 % % 1134 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1135 */ 1136 1137 STRINGIFY( 1138 /* 1139 */ 1140 __kernel void Histogram(__global CLPixelType * restrict im, 1141 const ChannelType channel, 1142 const unsigned int colorspace, 1143 const unsigned int method, 1144 __global uint4 * restrict histogram) 1145 { 1146 const int x = get_global_id(0); 1147 const int y = get_global_id(1); 1148 const int columns = get_global_size(0); 1149 const int c = x + y * columns; 1150 if ((channel & SyncChannels) != 0) 1151 { 1152 float red=(float)getRed(im[c]); 1153 float green=(float)getGreen(im[c]); 1154 float blue=(float)getBlue(im[c]); 1155 1156 float intensity = GetPixelIntensity(colorspace, method, red, green, blue); 1157 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); 1158 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position 1159 } 1160 else 1161 { 1162 // for equalizing, we always need all channels? 1163 // otherwise something more 1164 } 1165 } 1166 ) 1167 1168 STRINGIFY( 1169 /* 1170 */ 1171 __kernel void ContrastStretch(__global CLPixelType * restrict im, 1172 const ChannelType channel, 1173 __global CLPixelType * restrict stretch_map, 1174 const float4 white, const float4 black) 1175 { 1176 const int x = get_global_id(0); 1177 const int y = get_global_id(1); 1178 const int columns = get_global_size(0); 1179 const int c = x + y * columns; 1180 1181 uint ePos; 1182 CLPixelType oValue, eValue; 1183 CLQuantum red, green, blue, alpha; 1184 1185 //read from global 1186 oValue=im[c]; 1187 1188 if ((channel & RedChannel) != 0) 1189 { 1190 if (getRedF4(white) != getRedF4(black)) 1191 { 1192 ePos = ScaleQuantumToMap(getRed(oValue)); 1193 eValue = stretch_map[ePos]; 1194 red = getRed(eValue); 1195 } 1196 } 1197 1198 if ((channel & GreenChannel) != 0) 1199 { 1200 if (getGreenF4(white) != getGreenF4(black)) 1201 { 1202 ePos = ScaleQuantumToMap(getGreen(oValue)); 1203 eValue = stretch_map[ePos]; 1204 green = getGreen(eValue); 1205 } 1206 } 1207 1208 if ((channel & BlueChannel) != 0) 1209 { 1210 if (getBlueF4(white) != getBlueF4(black)) 1211 { 1212 ePos = ScaleQuantumToMap(getBlue(oValue)); 1213 eValue = stretch_map[ePos]; 1214 blue = getBlue(eValue); 1215 } 1216 } 1217 1218 if ((channel & AlphaChannel) != 0) 1219 { 1220 if (getAlphaF4(white) != getAlphaF4(black)) 1221 { 1222 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1223 eValue = stretch_map[ePos]; 1224 alpha = getAlpha(eValue); 1225 } 1226 } 1227 1228 //write back 1229 im[c]=(CLPixelType)(blue, green, red, alpha); 1230 1231 } 1232 ) 1233 1234 /* 1235 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1236 % % 1237 % % 1238 % % 1239 % C o n v o l v e % 1240 % % 1241 % % 1242 % % 1243 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1244 */ 1245 1246 STRINGIFY( 1247 __kernel 1248 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, 1249 const unsigned int imageWidth, const unsigned int imageHeight, 1250 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1251 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { 1252 1253 int2 blockID; 1254 blockID.x = get_global_id(0) / get_local_size(0); 1255 blockID.y = get_global_id(1) / get_local_size(1); 1256 1257 // image area processed by this workgroup 1258 int2 imageAreaOrg; 1259 imageAreaOrg.x = blockID.x * get_local_size(0); 1260 imageAreaOrg.y = blockID.y * get_local_size(1); 1261 1262 int2 midFilterDimen; 1263 midFilterDimen.x = (filterWidth-1)/2; 1264 midFilterDimen.y = (filterHeight-1)/2; 1265 1266 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; 1267 1268 // dimension of the local cache 1269 int2 cachedAreaDimen; 1270 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; 1271 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; 1272 1273 // cache the pixels accessed by this workgroup in local memory 1274 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); 1275 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; 1276 int groupSize = get_local_size(0) * get_local_size(1); 1277 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { 1278 1279 int2 cachedAreaIndex; 1280 cachedAreaIndex.x = i % cachedAreaDimen.x; 1281 cachedAreaIndex.y = i / cachedAreaDimen.x; 1282 1283 int2 imagePixelIndex; 1284 imagePixelIndex = cachedAreaOrg + cachedAreaIndex; 1285 1286 // only support EdgeVirtualPixelMethod through ClampToCanvas 1287 // TODO: implement other virtual pixel method 1288 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); 1289 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); 1290 1291 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; 1292 } 1293 1294 // cache the filter 1295 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { 1296 filterCache[i] = filter[i]; 1297 } 1298 barrier(CLK_LOCAL_MEM_FENCE); 1299 1300 1301 int2 imageIndex; 1302 imageIndex.x = imageAreaOrg.x + get_local_id(0); 1303 imageIndex.y = imageAreaOrg.y + get_local_id(1); 1304 1305 // if out-of-range, stops here and quit 1306 if (imageIndex.x >= imageWidth 1307 || imageIndex.y >= imageHeight) { 1308 return; 1309 } 1310 1311 int filterIndex = 0; 1312 float4 sum = (float4)0.0f; 1313 float gamma = 0.0f; 1314 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1315 int cacheIndexY = get_local_id(1); 1316 for (int j = 0; j < filterHeight; j++) { 1317 int cacheIndexX = get_local_id(0); 1318 for (int i = 0; i < filterWidth; i++) { 1319 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1320 float f = filterCache[filterIndex]; 1321 1322 sum.x += f * p.x; 1323 sum.y += f * p.y; 1324 sum.z += f * p.z; 1325 sum.w += f * p.w; 1326 1327 gamma += f; 1328 filterIndex++; 1329 cacheIndexX++; 1330 } 1331 cacheIndexY++; 1332 } 1333 } 1334 else { 1335 int cacheIndexY = get_local_id(1); 1336 for (int j = 0; j < filterHeight; j++) { 1337 int cacheIndexX = get_local_id(0); 1338 for (int i = 0; i < filterWidth; i++) { 1339 1340 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1341 float alpha = QuantumScale*p.w; 1342 float f = filterCache[filterIndex]; 1343 float g = alpha * f; 1344 1345 sum.x += g*p.x; 1346 sum.y += g*p.y; 1347 sum.z += g*p.z; 1348 sum.w += f*p.w; 1349 1350 gamma += g; 1351 filterIndex++; 1352 cacheIndexX++; 1353 } 1354 cacheIndexY++; 1355 } 1356 gamma = PerceptibleReciprocal(gamma); 1357 sum.xyz = gamma*sum.xyz; 1358 } 1359 CLPixelType outputPixel; 1360 outputPixel.x = ClampToQuantum(sum.x); 1361 outputPixel.y = ClampToQuantum(sum.y); 1362 outputPixel.z = ClampToQuantum(sum.z); 1363 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1364 1365 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1366 } 1367 ) 1368 1369 STRINGIFY( 1370 __kernel 1371 void Convolve(const __global CLPixelType *input, __global CLPixelType *output, 1372 const uint imageWidth, const uint imageHeight, 1373 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1374 const uint matte, const ChannelType channel) { 1375 1376 int2 imageIndex; 1377 imageIndex.x = get_global_id(0); 1378 imageIndex.y = get_global_id(1); 1379 1380 /* 1381 unsigned int imageWidth = get_global_size(0); 1382 unsigned int imageHeight = get_global_size(1); 1383 */ 1384 if (imageIndex.x >= imageWidth 1385 || imageIndex.y >= imageHeight) 1386 return; 1387 1388 int2 midFilterDimen; 1389 midFilterDimen.x = (filterWidth-1)/2; 1390 midFilterDimen.y = (filterHeight-1)/2; 1391 1392 int filterIndex = 0; 1393 float4 sum = (float4)0.0f; 1394 float gamma = 0.0f; 1395 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1396 for (int j = 0; j < filterHeight; j++) { 1397 int2 inputPixelIndex; 1398 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1399 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1400 for (int i = 0; i < filterWidth; i++) { 1401 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1402 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1403 1404 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1405 float f = filter[filterIndex]; 1406 1407 sum.x += f * p.x; 1408 sum.y += f * p.y; 1409 sum.z += f * p.z; 1410 sum.w += f * p.w; 1411 1412 gamma += f; 1413 1414 filterIndex++; 1415 } 1416 } 1417 } 1418 else { 1419 1420 for (int j = 0; j < filterHeight; j++) { 1421 int2 inputPixelIndex; 1422 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1423 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1424 for (int i = 0; i < filterWidth; i++) { 1425 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1426 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1427 1428 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1429 float alpha = QuantumScale*p.w; 1430 float f = filter[filterIndex]; 1431 float g = alpha * f; 1432 1433 sum.x += g*p.x; 1434 sum.y += g*p.y; 1435 sum.z += g*p.z; 1436 sum.w += f*p.w; 1437 1438 gamma += g; 1439 1440 1441 filterIndex++; 1442 } 1443 } 1444 gamma = PerceptibleReciprocal(gamma); 1445 sum.xyz = gamma*sum.xyz; 1446 } 1447 1448 CLPixelType outputPixel; 1449 outputPixel.x = ClampToQuantum(sum.x); 1450 outputPixel.y = ClampToQuantum(sum.y); 1451 outputPixel.z = ClampToQuantum(sum.z); 1452 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1453 1454 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1455 } 1456 ) 1457 1458 /* 1459 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1460 % % 1461 % % 1462 % % 1463 % D e s p e c k l e % 1464 % % 1465 % % 1466 % % 1467 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1468 */ 1469 1470 STRINGIFY( 1471 1472 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1473 , const unsigned int imageWidth, const unsigned int imageHeight 1474 , const int2 offset, const int polarity, const int matte) { 1475 1476 int x = get_global_id(0); 1477 int y = get_global_id(1); 1478 1479 CLPixelType v = inputImage[y*imageWidth+x]; 1480 1481 int2 neighbor; 1482 neighbor.y = y + offset.y; 1483 neighbor.x = x + offset.x; 1484 1485 int2 clampedNeighbor; 1486 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1487 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1488 1489 CLPixelType r = (clampedNeighbor.x == neighbor.x 1490 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1491 :(CLPixelType)0; 1492 1493 int sv[4]; 1494 sv[0] = (int)v.x; 1495 sv[1] = (int)v.y; 1496 sv[2] = (int)v.z; 1497 sv[3] = (int)v.w; 1498 1499 int sr[4]; 1500 sr[0] = (int)r.x; 1501 sr[1] = (int)r.y; 1502 sr[2] = (int)r.z; 1503 sr[3] = (int)r.w; 1504 1505 if (polarity > 0) { 1506 \n #pragma unroll 4\n 1507 for (unsigned int i = 0; i < 4; i++) { 1508 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; 1509 } 1510 } 1511 else { 1512 \n #pragma unroll 4\n 1513 for (unsigned int i = 0; i < 4; i++) { 1514 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; 1515 } 1516 1517 } 1518 1519 v.x = (CLQuantum)sv[0]; 1520 v.y = (CLQuantum)sv[1]; 1521 v.z = (CLQuantum)sv[2]; 1522 1523 if (matte!=0) 1524 v.w = (CLQuantum)sv[3]; 1525 1526 outputImage[y*imageWidth+x] = v; 1527 1528 } 1529 1530 1531 ) 1532 1533 STRINGIFY( 1534 1535 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1536 , const unsigned int imageWidth, const unsigned int imageHeight 1537 , const int2 offset, const int polarity, const int matte) { 1538 1539 int x = get_global_id(0); 1540 int y = get_global_id(1); 1541 1542 CLPixelType v = inputImage[y*imageWidth+x]; 1543 1544 int2 neighbor, clampedNeighbor; 1545 1546 neighbor.y = y + offset.y; 1547 neighbor.x = x + offset.x; 1548 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1549 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1550 1551 CLPixelType r = (clampedNeighbor.x == neighbor.x 1552 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1553 :(CLPixelType)0; 1554 1555 1556 neighbor.y = y - offset.y; 1557 neighbor.x = x - offset.x; 1558 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1559 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1560 1561 CLPixelType s = (clampedNeighbor.x == neighbor.x 1562 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1563 :(CLPixelType)0; 1564 1565 1566 int sv[4]; 1567 sv[0] = (int)v.x; 1568 sv[1] = (int)v.y; 1569 sv[2] = (int)v.z; 1570 sv[3] = (int)v.w; 1571 1572 int sr[4]; 1573 sr[0] = (int)r.x; 1574 sr[1] = (int)r.y; 1575 sr[2] = (int)r.z; 1576 sr[3] = (int)r.w; 1577 1578 int ss[4]; 1579 ss[0] = (int)s.x; 1580 ss[1] = (int)s.y; 1581 ss[2] = (int)s.z; 1582 ss[3] = (int)s.w; 1583 1584 if (polarity > 0) { 1585 \n #pragma unroll 4\n 1586 for (unsigned int i = 0; i < 4; i++) { 1587 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; 1588 // 1589 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1590 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1591 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1592 } 1593 } 1594 else { 1595 \n #pragma unroll 4\n 1596 for (unsigned int i = 0; i < 4; i++) { 1597 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; 1598 // 1599 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1600 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1601 } 1602 } 1603 1604 v.x = (CLQuantum)sv[0]; 1605 v.y = (CLQuantum)sv[1]; 1606 v.z = (CLQuantum)sv[2]; 1607 1608 if (matte!=0) 1609 v.w = (CLQuantum)sv[3]; 1610 1611 outputImage[y*imageWidth+x] = v; 1612 1613 } 1614 ) 1615 1616 /* 1617 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1618 % % 1619 % % 1620 % % 1621 % E q u a l i z e % 1622 % % 1623 % % 1624 % % 1625 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1626 */ 1627 1628 STRINGIFY( 1629 /* 1630 */ 1631 __kernel void Equalize(__global CLPixelType * restrict im, 1632 const ChannelType channel, 1633 __global CLPixelType * restrict equalize_map, 1634 const float4 white, const float4 black) 1635 { 1636 const int x = get_global_id(0); 1637 const int y = get_global_id(1); 1638 const int columns = get_global_size(0); 1639 const int c = x + y * columns; 1640 1641 uint ePos; 1642 CLPixelType oValue, eValue; 1643 CLQuantum red, green, blue, alpha; 1644 1645 //read from global 1646 oValue=im[c]; 1647 1648 if ((channel & SyncChannels) != 0) 1649 { 1650 if (getRedF4(white) != getRedF4(black)) 1651 { 1652 ePos = ScaleQuantumToMap(getRed(oValue)); 1653 eValue = equalize_map[ePos]; 1654 red = getRed(eValue); 1655 ePos = ScaleQuantumToMap(getGreen(oValue)); 1656 eValue = equalize_map[ePos]; 1657 green = getRed(eValue); 1658 ePos = ScaleQuantumToMap(getBlue(oValue)); 1659 eValue = equalize_map[ePos]; 1660 blue = getRed(eValue); 1661 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1662 eValue = equalize_map[ePos]; 1663 alpha = getRed(eValue); 1664 1665 //write back 1666 im[c]=(CLPixelType)(blue, green, red, alpha); 1667 } 1668 1669 } 1670 1671 // for equalizing, we always need all channels? 1672 // otherwise something more 1673 1674 } 1675 ) 1676 1677 /* 1678 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1679 % % 1680 % % 1681 % % 1682 % F u n c t i o n % 1683 % % 1684 % % 1685 % % 1686 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1687 */ 1688 1689 STRINGIFY( 1690 /* 1691 apply FunctionImageChannel(braightness-contrast) 1692 */ 1693 CLQuantum ApplyFunction(float pixel,const MagickFunction function, 1694 const unsigned int number_parameters,__constant float *parameters) 1695 { 1696 float result = 0.0f; 1697 1698 switch (function) 1699 { 1700 case PolynomialFunction: 1701 { 1702 for (unsigned int i=0; i < number_parameters; i++) 1703 result = result*QuantumScale*pixel + parameters[i]; 1704 result *= QuantumRange; 1705 break; 1706 } 1707 case SinusoidFunction: 1708 { 1709 float freq,phase,ampl,bias; 1710 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1711 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; 1712 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; 1713 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1714 result = QuantumRange*(ampl*sin(2.0f*MagickPI* 1715 (freq*QuantumScale*pixel + phase/360.0f)) + bias); 1716 break; 1717 } 1718 case ArcsinFunction: 1719 { 1720 float width,range,center,bias; 1721 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1722 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1723 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1724 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1725 1726 result = 2.0f/width*(QuantumScale*pixel - center); 1727 result = range/MagickPI*asin(result)+bias; 1728 result = ( result <= -1.0f ) ? bias - range/2.0f : result; 1729 result = ( result >= 1.0f ) ? bias + range/2.0f : result; 1730 result *= QuantumRange; 1731 break; 1732 } 1733 case ArctanFunction: 1734 { 1735 float slope,range,center,bias; 1736 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1737 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1738 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1739 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1740 result = MagickPI*slope*(QuantumScale*pixel-center); 1741 result = QuantumRange*(range/MagickPI*atan(result) + bias); 1742 break; 1743 } 1744 case UndefinedFunction: 1745 break; 1746 } 1747 return(ClampToQuantum(result)); 1748 } 1749 ) 1750 1751 STRINGIFY( 1752 /* 1753 Improve brightness / contrast of the image 1754 channel : define which channel is improved 1755 function : the function called to enchance the brightness contrast 1756 number_parameters : numbers of parameters 1757 parameters : the parameter 1758 */ 1759 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, 1760 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, 1761 __constant float *parameters) 1762 { 1763 const unsigned int x = get_global_id(0); 1764 const unsigned int y = get_global_id(1); 1765 const unsigned int columns = get_global_size(0); 1766 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1767 1768 float red; 1769 float green; 1770 float blue; 1771 float alpha; 1772 1773 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 1774 1775 if ((channel & RedChannel) != 0) 1776 red=ApplyFunction(red, function, number_parameters, parameters); 1777 1778 if (number_channels > 2) 1779 { 1780 if ((channel & GreenChannel) != 0) 1781 green=ApplyFunction(green, function, number_parameters, parameters); 1782 1783 if ((channel & BlueChannel) != 0) 1784 blue=ApplyFunction(blue, function, number_parameters, parameters); 1785 } 1786 1787 if (((number_channels == 4) || (number_channels == 2)) && 1788 ((channel & AlphaChannel) != 0)) 1789 alpha=ApplyFunction(alpha, function, number_parameters, parameters); 1790 1791 WriteChannels(p, number_channels, channel, red, green, blue, alpha); 1792 } 1793 ) 1794 1795 /* 1796 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1797 % % 1798 % % 1799 % % 1800 % G r a y s c a l e % 1801 % % 1802 % % 1803 % % 1804 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1805 */ 1806 1807 STRINGIFY( 1808 __kernel void Grayscale(__global CLQuantum *image,const int number_channels, 1809 const unsigned int colorspace,const unsigned int method) 1810 { 1811 const unsigned int x = get_global_id(0); 1812 const unsigned int y = get_global_id(1); 1813 const unsigned int columns = get_global_size(0); 1814 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1815 1816 float 1817 blue, 1818 green, 1819 red; 1820 1821 red=getPixelRed(p); 1822 green=getPixelGreen(p); 1823 blue=getPixelBlue(p); 1824 1825 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); 1826 1827 setPixelRed(p,intensity); 1828 setPixelGreen(p,intensity); 1829 setPixelBlue(p,intensity); 1830 } 1831 ) 1832 1833 /* 1834 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1835 % % 1836 % % 1837 % % 1838 % L o c a l C o n t r a s t % 1839 % % 1840 % % 1841 % % 1842 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1843 */ 1844 1845 STRINGIFY( 1846 1847 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, 1848 const int radius, 1849 const int imageWidth, 1850 const int imageHeight) 1851 { 1852 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); 1853 1854 int x = get_local_id(0); 1855 int y = get_global_id(1); 1856 1857 if ((x >= imageWidth) || (y >= imageHeight)) 1858 return; 1859 1860 global CLPixelType *src = srcImage + y * imageWidth; 1861 1862 for (int i = x; i < imageWidth; i += get_local_size(0)) { 1863 float sum = 0.0f; 1864 float weight = 1.0f; 1865 1866 int j = i - radius; 1867 while ((j + 7) < i) { 1868 for (int k = 0; k < 8; ++k) // Unroll 8x 1869 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); 1870 weight += 8.0f; 1871 j+=8; 1872 } 1873 while (j < i) { 1874 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); 1875 weight += 1.0f; 1876 ++j; 1877 } 1878 1879 while ((j + 7) < radius + i) { 1880 for (int k = 0; k < 8; ++k) // Unroll 8x 1881 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); 1882 weight -= 8.0f; 1883 j+=8; 1884 } 1885 while (j < radius + i) { 1886 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); 1887 weight -= 1.0f; 1888 ++j; 1889 } 1890 1891 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); 1892 } 1893 } 1894 ) 1895 1896 STRINGIFY( 1897 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, 1898 const int radius, 1899 const float strength, 1900 const int imageWidth, 1901 const int imageHeight) 1902 { 1903 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); 1904 1905 int x = get_global_id(0); 1906 int y = get_global_id(1); 1907 1908 if ((x >= imageWidth) || (y >= imageHeight)) 1909 return; 1910 1911 global float *src = blurImage + x; 1912 1913 float sum = 0.0f; 1914 float weight = 1.0f; 1915 1916 int j = y - radius; 1917 while ((j + 7) < y) { 1918 for (int k = 0; k < 8; ++k) // Unroll 8x 1919 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; 1920 weight += 8.0f; 1921 j+=8; 1922 } 1923 while (j < y) { 1924 sum += weight * src[mirrorBottom(j) * imageWidth]; 1925 weight += 1.0f; 1926 ++j; 1927 } 1928 1929 while ((j + 7) < radius + y) { 1930 for (int k = 0; k < 8; ++k) // Unroll 8x 1931 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; 1932 weight -= 8.0f; 1933 j+=8; 1934 } 1935 while (j < radius + y) { 1936 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; 1937 weight -= 1.0f; 1938 ++j; 1939 } 1940 1941 CLPixelType pixel = srcImage[x + y * imageWidth]; 1942 float srcVal = dot(RGB, convert_float4(pixel)); 1943 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); 1944 mult = (srcVal + mult) / srcVal; 1945 1946 pixel.x = ClampToQuantum(pixel.x * mult); 1947 pixel.y = ClampToQuantum(pixel.y * mult); 1948 pixel.z = ClampToQuantum(pixel.z * mult); 1949 1950 dstImage[x + y * imageWidth] = pixel; 1951 } 1952 ) 1953 1954 /* 1955 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1956 % % 1957 % % 1958 % % 1959 % M o d u l a t e % 1960 % % 1961 % % 1962 % % 1963 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1964 */ 1965 1966 STRINGIFY( 1967 1968 static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, 1969 float *hue, float *saturation, float *lightness) 1970 { 1971 float 1972 c, 1973 tmax, 1974 tmin; 1975 1976 /* 1977 Convert RGB to HSL colorspace. 1978 */ 1979 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); 1980 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); 1981 1982 c=tmax-tmin; 1983 1984 *lightness=(tmax+tmin)/2.0; 1985 if (c <= 0.0) 1986 { 1987 *hue=0.0; 1988 *saturation=0.0; 1989 return; 1990 } 1991 1992 if (tmax == (QuantumScale*red)) 1993 { 1994 *hue=(QuantumScale*green-QuantumScale*blue)/c; 1995 if ((QuantumScale*green) < (QuantumScale*blue)) 1996 *hue+=6.0; 1997 } 1998 else 1999 if (tmax == (QuantumScale*green)) 2000 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; 2001 else 2002 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; 2003 2004 *hue*=60.0/360.0; 2005 if (*lightness <= 0.5) 2006 *saturation=c/(2.0*(*lightness)); 2007 else 2008 *saturation=c/(2.0-2.0*(*lightness)); 2009 } 2010 2011 static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, 2012 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2013 { 2014 float 2015 b, 2016 c, 2017 g, 2018 h, 2019 tmin, 2020 r, 2021 x; 2022 2023 /* 2024 Convert HSL to RGB colorspace. 2025 */ 2026 h=hue*360.0; 2027 if (lightness <= 0.5) 2028 c=2.0*lightness*saturation; 2029 else 2030 c=(2.0-2.0*lightness)*saturation; 2031 tmin=lightness-0.5*c; 2032 h-=360.0*floor(h/360.0); 2033 h/=60.0; 2034 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); 2035 switch ((int) floor(h) % 6) 2036 { 2037 case 0: 2038 { 2039 r=tmin+c; 2040 g=tmin+x; 2041 b=tmin; 2042 break; 2043 } 2044 case 1: 2045 { 2046 r=tmin+x; 2047 g=tmin+c; 2048 b=tmin; 2049 break; 2050 } 2051 case 2: 2052 { 2053 r=tmin; 2054 g=tmin+c; 2055 b=tmin+x; 2056 break; 2057 } 2058 case 3: 2059 { 2060 r=tmin; 2061 g=tmin+x; 2062 b=tmin+c; 2063 break; 2064 } 2065 case 4: 2066 { 2067 r=tmin+x; 2068 g=tmin; 2069 b=tmin+c; 2070 break; 2071 } 2072 case 5: 2073 { 2074 r=tmin+c; 2075 g=tmin; 2076 b=tmin+x; 2077 break; 2078 } 2079 default: 2080 { 2081 r=0.0; 2082 g=0.0; 2083 b=0.0; 2084 } 2085 } 2086 *red=ClampToQuantum(QuantumRange*r); 2087 *green=ClampToQuantum(QuantumRange*g); 2088 *blue=ClampToQuantum(QuantumRange*b); 2089 } 2090 2091 static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 2092 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2093 { 2094 float 2095 hue, 2096 lightness, 2097 saturation; 2098 2099 /* 2100 Increase or decrease color lightness, saturation, or hue. 2101 */ 2102 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); 2103 hue+=0.5*(0.01*percent_hue-1.0); 2104 while (hue < 0.0) 2105 hue+=1.0; 2106 while (hue >= 1.0) 2107 hue-=1.0; 2108 saturation*=0.01*percent_saturation; 2109 lightness*=0.01*percent_lightness; 2110 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); 2111 } 2112 2113 __kernel void Modulate(__global CLPixelType *im, 2114 const float percent_brightness, 2115 const float percent_hue, 2116 const float percent_saturation, 2117 const int colorspace) 2118 { 2119 2120 const int x = get_global_id(0); 2121 const int y = get_global_id(1); 2122 const int columns = get_global_size(0); 2123 const int c = x + y * columns; 2124 2125 CLPixelType pixel = im[c]; 2126 2127 CLQuantum 2128 blue, 2129 green, 2130 red; 2131 2132 red=getRed(pixel); 2133 green=getGreen(pixel); 2134 blue=getBlue(pixel); 2135 2136 switch (colorspace) 2137 { 2138 case HSLColorspace: 2139 default: 2140 { 2141 ModulateHSL(percent_hue, percent_saturation, percent_brightness, 2142 &red, &green, &blue); 2143 } 2144 2145 } 2146 2147 CLPixelType filteredPixel; 2148 2149 setRed(&filteredPixel, red); 2150 setGreen(&filteredPixel, green); 2151 setBlue(&filteredPixel, blue); 2152 filteredPixel.w = pixel.w; 2153 2154 im[c] = filteredPixel; 2155 } 2156 ) 2157 2158 /* 2159 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2160 % % 2161 % % 2162 % % 2163 % M o t i o n B l u r % 2164 % % 2165 % % 2166 % % 2167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2168 */ 2169 2170 STRINGIFY( 2171 __kernel 2172 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, 2173 const unsigned int imageWidth, const unsigned int imageHeight, 2174 const __global float *filter, const unsigned int width, const __global int2* offset, 2175 const float4 bias, 2176 const ChannelType channel, const unsigned int matte) { 2177 2178 int2 currentPixel; 2179 currentPixel.x = get_global_id(0); 2180 currentPixel.y = get_global_id(1); 2181 2182 if (currentPixel.x >= imageWidth 2183 || currentPixel.y >= imageHeight) 2184 return; 2185 2186 float4 pixel; 2187 pixel.x = (float)bias.x; 2188 pixel.y = (float)bias.y; 2189 pixel.z = (float)bias.z; 2190 pixel.w = (float)bias.w; 2191 2192 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 2193 2194 for (int i = 0; i < width; i++) { 2195 // only support EdgeVirtualPixelMethod through ClampToCanvas 2196 // TODO: implement other virtual pixel method 2197 int2 samplePixel = currentPixel + offset[i]; 2198 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2199 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2200 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2201 2202 pixel.x += (filter[i] * (float)samplePixelValue.x); 2203 pixel.y += (filter[i] * (float)samplePixelValue.y); 2204 pixel.z += (filter[i] * (float)samplePixelValue.z); 2205 pixel.w += (filter[i] * (float)samplePixelValue.w); 2206 } 2207 2208 CLPixelType outputPixel; 2209 outputPixel.x = ClampToQuantum(pixel.x); 2210 outputPixel.y = ClampToQuantum(pixel.y); 2211 outputPixel.z = ClampToQuantum(pixel.z); 2212 outputPixel.w = ClampToQuantum(pixel.w); 2213 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2214 } 2215 else { 2216 2217 float gamma = 0.0f; 2218 for (int i = 0; i < width; i++) { 2219 // only support EdgeVirtualPixelMethod through ClampToCanvas 2220 // TODO: implement other virtual pixel method 2221 int2 samplePixel = currentPixel + offset[i]; 2222 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2223 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2224 2225 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2226 2227 float alpha = QuantumScale*samplePixelValue.w; 2228 float k = filter[i]; 2229 pixel.x = pixel.x + k * alpha * samplePixelValue.x; 2230 pixel.y = pixel.y + k * alpha * samplePixelValue.y; 2231 pixel.z = pixel.z + k * alpha * samplePixelValue.z; 2232 2233 pixel.w += k * alpha * samplePixelValue.w; 2234 2235 gamma+=k*alpha; 2236 } 2237 gamma = PerceptibleReciprocal(gamma); 2238 pixel.xyz = gamma*pixel.xyz; 2239 2240 CLPixelType outputPixel; 2241 outputPixel.x = ClampToQuantum(pixel.x); 2242 outputPixel.y = ClampToQuantum(pixel.y); 2243 outputPixel.z = ClampToQuantum(pixel.z); 2244 outputPixel.w = ClampToQuantum(pixel.w); 2245 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2246 } 2247 } 2248 ) 2249 2250 /* 2251 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2252 % % 2253 % % 2254 % % 2255 % R e s i z e % 2256 % % 2257 % % 2258 % % 2259 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2260 */ 2261 2262 STRINGIFY( 2263 // Based on Box from resize.c 2264 float BoxResizeFilter(const float x) 2265 { 2266 return 1.0f; 2267 } 2268 ) 2269 2270 STRINGIFY( 2271 // Based on CubicBC from resize.c 2272 float CubicBC(const float x,const __global float* resizeFilterCoefficients) 2273 { 2274 /* 2275 Cubic Filters using B,C determined values: 2276 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter 2277 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears 2278 Spline B = 1 C = 0 B-Spline Gaussian approximation 2279 Hermite B = 0 C = 0 B-Spline interpolator 2280 2281 See paper by Mitchell and Netravali, Reconstruction Filters in Computer 2282 Graphics Computer Graphics, Volume 22, Number 4, August 1988 2283 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ 2284 Mitchell.pdf. 2285 2286 Coefficents are determined from B,C values: 2287 P0 = ( 6 - 2*B )/6 = coeff[0] 2288 P1 = 0 2289 P2 = (-18 +12*B + 6*C )/6 = coeff[1] 2290 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] 2291 Q0 = ( 8*B +24*C )/6 = coeff[3] 2292 Q1 = ( -12*B -48*C )/6 = coeff[4] 2293 Q2 = ( 6*B +30*C )/6 = coeff[5] 2294 Q3 = ( - 1*B - 6*C )/6 = coeff[6] 2295 2296 which are used to define the filter: 2297 2298 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 2299 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 2300 2301 which ensures function is continuous in value and derivative (slope). 2302 */ 2303 if (x < 1.0) 2304 return(resizeFilterCoefficients[0]+x*(x* 2305 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); 2306 if (x < 2.0) 2307 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* 2308 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); 2309 return(0.0); 2310 } 2311 ) 2312 2313 STRINGIFY( 2314 float Sinc(const float x) 2315 { 2316 if (x != 0.0f) 2317 { 2318 const float alpha=(float) (MagickPI*x); 2319 return sinpi(x)/alpha; 2320 } 2321 return(1.0f); 2322 } 2323 ) 2324 2325 STRINGIFY( 2326 float Triangle(const float x) 2327 { 2328 /* 2329 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or 2330 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function 2331 for Sinc(). 2332 */ 2333 return ((x<1.0f)?(1.0f-x):0.0f); 2334 } 2335 ) 2336 2337 2338 STRINGIFY( 2339 float Hann(const float x) 2340 { 2341 /* 2342 Cosine window function: 2343 0.5+0.5*cos(pi*x). 2344 */ 2345 const float cosine=cos((MagickPI*x)); 2346 return(0.5f+0.5f*cosine); 2347 } 2348 ) 2349 2350 STRINGIFY( 2351 float Hamming(const float x) 2352 { 2353 /* 2354 Offset cosine window function: 2355 .54 + .46 cos(pi x). 2356 */ 2357 const float cosine=cos((MagickPI*x)); 2358 return(0.54f+0.46f*cosine); 2359 } 2360 ) 2361 2362 STRINGIFY( 2363 float Blackman(const float x) 2364 { 2365 /* 2366 Blackman: 2nd order cosine windowing function: 2367 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) 2368 2369 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and 2370 five flops. 2371 */ 2372 const float cosine=cos((MagickPI*x)); 2373 return(0.34f+cosine*(0.5f+cosine*0.16f)); 2374 } 2375 ) 2376 2377 STRINGIFY( 2378 static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) 2379 { 2380 switch (filterType) 2381 { 2382 /* Call Sinc even for SincFast to get better precision on GPU 2383 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ 2384 case SincWeightingFunction: 2385 case SincFastWeightingFunction: 2386 return Sinc(x); 2387 case CubicBCWeightingFunction: 2388 return CubicBC(x,filterCoefficients); 2389 case BoxWeightingFunction: 2390 return BoxResizeFilter(x); 2391 case TriangleWeightingFunction: 2392 return Triangle(x); 2393 case HannWeightingFunction: 2394 return Hann(x); 2395 case HammingWeightingFunction: 2396 return Hamming(x); 2397 case BlackmanWeightingFunction: 2398 return Blackman(x); 2399 2400 default: 2401 return 0.0f; 2402 } 2403 } 2404 ) 2405 2406 2407 STRINGIFY( 2408 static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType 2409 , const ResizeWeightingFunctionType resizeWindowType 2410 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) 2411 { 2412 float scale; 2413 float xBlur = fabs(x/resizeFilterBlur); 2414 if (resizeWindowSupport < MagickEpsilon 2415 || resizeWindowType == BoxWeightingFunction) 2416 { 2417 scale = 1.0f; 2418 } 2419 else 2420 { 2421 scale = resizeFilterScale; 2422 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); 2423 } 2424 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); 2425 return weight; 2426 } 2427 2428 ) 2429 2430 ; 2431 const char *accelerateKernels2 = 2432 2433 STRINGIFY( 2434 2435 static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2436 return (numWorkItems/pixelPerWorkgroup); 2437 } 2438 2439 // returns the index of the pixel for the current workitem to compute. 2440 // returns -1 if this workitem doesn't need to participate in any computation 2441 static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2442 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); 2443 int pixelIndex = itemID/numWorkItemsPerPixel; 2444 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; 2445 return pixelIndex; 2446 } 2447 2448 ) 2449 2450 STRINGIFY( 2451 __kernel __attribute__((reqd_work_group_size(256, 1, 1))) 2452 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2453 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2454 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, 2455 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2456 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2457 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2458 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2459 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2460 { 2461 // calculate the range of resized image pixels computed by this workgroup 2462 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; 2463 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); 2464 const unsigned int actualNumPixelToCompute = stopX - startX; 2465 2466 // calculate the range of input image pixels to cache 2467 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); 2468 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2469 scale = PerceptibleReciprocal(scale); 2470 2471 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); 2472 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); 2473 2474 // cache the input pixels into local memory 2475 const unsigned int y = get_global_id(1); 2476 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); 2477 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; 2478 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); 2479 wait_group_events(1, &e); 2480 2481 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2482 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2483 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2484 { 2485 const unsigned int chunkStartX = startX + chunk*pixelChunkSize; 2486 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); 2487 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; 2488 2489 // determine which resized pixel computed by this workitem 2490 const unsigned int itemID = get_local_id(0); 2491 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); 2492 2493 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); 2494 2495 float4 filteredPixel = (float4)0.0f; 2496 float density = 0.0f; 2497 float gamma = 0.0f; 2498 // -1 means this workitem doesn't participate in the computation 2499 if (pixelIndex != -1) 2500 { 2501 // x coordinated of the resized pixel computed by this workitem 2502 const int x = chunkStartX + pixelIndex; 2503 2504 // calculate how many steps required for this pixel 2505 const float bisect = (x+0.5)/xFactor+MagickEpsilon; 2506 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2507 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); 2508 const unsigned int n = stop - start; 2509 2510 // calculate how many steps this workitem will contribute 2511 unsigned int numStepsPerWorkItem = n / numItems; 2512 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2513 2514 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2515 if (startStep < n) 2516 { 2517 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2518 2519 unsigned int cacheIndex = start+startStep-cacheRangeStartX; 2520 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2521 { 2522 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2523 (ResizeWeightingFunctionType) resizeFilterType, 2524 (ResizeWeightingFunctionType) resizeWindowType, 2525 resizeFilterScale, resizeFilterWindowSupport, 2526 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2527 2528 float4 cp = (float4)0.0f; 2529 2530 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); 2531 cp.x = (float) *(p); 2532 if (number_channels > 2) 2533 { 2534 cp.y = (float) *(p + 1); 2535 cp.z = (float) *(p + 2); 2536 } 2537 2538 if (alpha_index != 0) 2539 { 2540 cp.w = (float) *(p + alpha_index); 2541 2542 float alpha = weight * QuantumScale * cp.w; 2543 2544 filteredPixel.x += alpha * cp.x; 2545 filteredPixel.y += alpha * cp.y; 2546 filteredPixel.z += alpha * cp.z; 2547 filteredPixel.w += weight * cp.w; 2548 gamma += alpha; 2549 } 2550 else 2551 filteredPixel += ((float4) weight)*cp; 2552 2553 density += weight; 2554 } 2555 } 2556 } 2557 2558 // initialize the accumulators to zero 2559 if (itemID < actualNumPixelInThisChunk) { 2560 outputPixelCache[itemID] = (float4)0.0f; 2561 densityCache[itemID] = 0.0f; 2562 if (alpha_index != 0) 2563 gammaCache[itemID] = 0.0f; 2564 } 2565 barrier(CLK_LOCAL_MEM_FENCE); 2566 2567 // accumulatte the filtered pixel value and the density 2568 for (unsigned int i = 0; i < numItems; i++) { 2569 if (pixelIndex != -1) { 2570 if (itemID%numItems == i) { 2571 outputPixelCache[pixelIndex]+=filteredPixel; 2572 densityCache[pixelIndex]+=density; 2573 if (alpha_index != 0) 2574 gammaCache[pixelIndex]+=gamma; 2575 } 2576 } 2577 barrier(CLK_LOCAL_MEM_FENCE); 2578 } 2579 2580 if (itemID < actualNumPixelInThisChunk) 2581 { 2582 float4 filteredPixel = outputPixelCache[itemID]; 2583 2584 float gamma = 0.0f; 2585 if (alpha_index != 0) 2586 gamma = gammaCache[itemID]; 2587 2588 float density = densityCache[itemID]; 2589 if ((density != 0.0f) && (density != 1.0f)) 2590 { 2591 density = PerceptibleReciprocal(density); 2592 filteredPixel *= (float4) density; 2593 if (alpha_index != 0) 2594 gamma *= density; 2595 } 2596 2597 if (alpha_index != 0) 2598 { 2599 gamma = PerceptibleReciprocal(gamma); 2600 filteredPixel.x *= gamma; 2601 filteredPixel.y *= gamma; 2602 filteredPixel.z *= gamma; 2603 } 2604 2605 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel); 2606 } 2607 } 2608 } 2609 ) 2610 2611 2612 STRINGIFY( 2613 __kernel __attribute__((reqd_work_group_size(1, 256, 1))) 2614 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2615 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2616 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, 2617 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2618 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2619 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2620 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2621 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2622 { 2623 // calculate the range of resized image pixels computed by this workgroup 2624 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; 2625 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); 2626 const unsigned int actualNumPixelToCompute = stopY - startY; 2627 2628 // calculate the range of input image pixels to cache 2629 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); 2630 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2631 scale = PerceptibleReciprocal(scale); 2632 2633 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); 2634 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); 2635 2636 // cache the input pixels into local memory 2637 const unsigned int x = get_global_id(0); 2638 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); 2639 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; 2640 unsigned int stride = inputColumns * number_channels; 2641 for (unsigned int i = 0; i < number_channels; i++) 2642 { 2643 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); 2644 wait_group_events(1,&e); 2645 } 2646 2647 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2648 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2649 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2650 { 2651 const unsigned int chunkStartY = startY + chunk*pixelChunkSize; 2652 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); 2653 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; 2654 2655 // determine which resized pixel computed by this workitem 2656 const unsigned int itemID = get_local_id(1); 2657 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); 2658 2659 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); 2660 2661 float4 filteredPixel = (float4)0.0f; 2662 float density = 0.0f; 2663 float gamma = 0.0f; 2664 // -1 means this workitem doesn't participate in the computation 2665 if (pixelIndex != -1) 2666 { 2667 // x coordinated of the resized pixel computed by this workitem 2668 const int y = chunkStartY + pixelIndex; 2669 2670 // calculate how many steps required for this pixel 2671 const float bisect = (y+0.5)/yFactor+MagickEpsilon; 2672 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2673 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); 2674 const unsigned int n = stop - start; 2675 2676 // calculate how many steps this workitem will contribute 2677 unsigned int numStepsPerWorkItem = n / numItems; 2678 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2679 2680 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2681 if (startStep < n) 2682 { 2683 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2684 2685 unsigned int cacheIndex = start+startStep-cacheRangeStartY; 2686 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2687 { 2688 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2689 (ResizeWeightingFunctionType) resizeFilterType, 2690 (ResizeWeightingFunctionType) resizeWindowType, 2691 resizeFilterScale, resizeFilterWindowSupport, 2692 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2693 2694 float4 cp = (float4)0.0f; 2695 2696 __local CLQuantum *p = inputImageCache + cacheIndex; 2697 cp.x = (float) *(p); 2698 if (number_channels > 2) 2699 { 2700 cp.y = (float) *(p + rangeLength); 2701 cp.z = (float) *(p + (rangeLength * 2)); 2702 } 2703 2704 if (alpha_index != 0) 2705 { 2706 cp.w = (float) *(p + (rangeLength * alpha_index)); 2707 2708 float alpha = weight * QuantumScale * cp.w; 2709 2710 filteredPixel.x += alpha * cp.x; 2711 filteredPixel.y += alpha * cp.y; 2712 filteredPixel.z += alpha * cp.z; 2713 filteredPixel.w += weight * cp.w; 2714 gamma += alpha; 2715 } 2716 else 2717 filteredPixel += ((float4) weight)*cp; 2718 2719 density += weight; 2720 } 2721 } 2722 } 2723 2724 // initialize the accumulators to zero 2725 if (itemID < actualNumPixelInThisChunk) { 2726 outputPixelCache[itemID] = (float4)0.0f; 2727 densityCache[itemID] = 0.0f; 2728 if (alpha_index != 0) 2729 gammaCache[itemID] = 0.0f; 2730 } 2731 barrier(CLK_LOCAL_MEM_FENCE); 2732 2733 // accumulatte the filtered pixel value and the density 2734 for (unsigned int i = 0; i < numItems; i++) { 2735 if (pixelIndex != -1) { 2736 if (itemID%numItems == i) { 2737 outputPixelCache[pixelIndex]+=filteredPixel; 2738 densityCache[pixelIndex]+=density; 2739 if (alpha_index != 0) 2740 gammaCache[pixelIndex]+=gamma; 2741 } 2742 } 2743 barrier(CLK_LOCAL_MEM_FENCE); 2744 } 2745 2746 if (itemID < actualNumPixelInThisChunk) 2747 { 2748 float4 filteredPixel = outputPixelCache[itemID]; 2749 2750 float gamma = 0.0f; 2751 if (alpha_index != 0) 2752 gamma = gammaCache[itemID]; 2753 2754 float density = densityCache[itemID]; 2755 if ((density != 0.0f) && (density != 1.0f)) 2756 { 2757 density = PerceptibleReciprocal(density); 2758 filteredPixel *= (float4) density; 2759 if (alpha_index != 0) 2760 gamma *= density; 2761 } 2762 2763 if (alpha_index != 0) 2764 { 2765 gamma = PerceptibleReciprocal(gamma); 2766 filteredPixel.x *= gamma; 2767 filteredPixel.y *= gamma; 2768 filteredPixel.z *= gamma; 2769 } 2770 2771 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel); 2772 } 2773 } 2774 } 2775 ) 2776 2777 /* 2778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2779 % % 2780 % % 2781 % % 2782 % R o t a t i o n a l B l u r % 2783 % % 2784 % % 2785 % % 2786 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2787 */ 2788 2789 STRINGIFY( 2790 __kernel void RotationalBlur(const __global CLQuantum *image, 2791 const unsigned int number_channels,const unsigned int channel, 2792 const float2 blurCenter,__constant float *cos_theta, 2793 __constant float *sin_theta,const unsigned int cossin_theta_size, 2794 __global CLQuantum *filteredImage) 2795 { 2796 const int x = get_global_id(0); 2797 const int y = get_global_id(1); 2798 const int columns = get_global_size(0); 2799 const int rows = get_global_size(1); 2800 unsigned int step = 1; 2801 float center_x = (float) x - blurCenter.x; 2802 float center_y = (float) y - blurCenter.y; 2803 float radius = hypot(center_x, center_y); 2804 2805 float blur_radius = hypot(blurCenter.x, blurCenter.y); 2806 2807 if (radius > MagickEpsilon) 2808 { 2809 step = (unsigned int) (blur_radius / radius); 2810 if (step == 0) 2811 step = 1; 2812 if (step >= cossin_theta_size) 2813 step = cossin_theta_size-1; 2814 } 2815 2816 float4 result = 0.0f; 2817 float normalize = 0.0f; 2818 float gamma = 0.0f; 2819 2820 for (unsigned int i=0; i<cossin_theta_size; i+=step) 2821 { 2822 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); 2823 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); 2824 2825 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy); 2826 2827 if ((number_channels == 4) || (number_channels == 2)) 2828 { 2829 float alpha = (float)(QuantumScale*pixel.w); 2830 2831 gamma += alpha; 2832 2833 result.x += alpha * pixel.x; 2834 result.y += alpha * pixel.y; 2835 result.z += alpha * pixel.z; 2836 result.w += pixel.w; 2837 } 2838 else 2839 result += pixel; 2840 2841 normalize += 1.0f; 2842 } 2843 2844 normalize = PerceptibleReciprocal(normalize); 2845 2846 if ((number_channels == 4) || (number_channels == 2)) 2847 { 2848 gamma = PerceptibleReciprocal(gamma); 2849 result.x *= gamma; 2850 result.y *= gamma; 2851 result.z *= gamma; 2852 result.w *= normalize; 2853 } 2854 else 2855 result *= normalize; 2856 2857 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 2858 } 2859 ) 2860 2861 /* 2862 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2863 % % 2864 % % 2865 % % 2866 % U n s h a r p M a s k % 2867 % % 2868 % % 2869 % % 2870 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2871 */ 2872 2873 STRINGIFY( 2874 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, 2875 const __global float4 *blurRowData,const unsigned int number_channels, 2876 const ChannelType channel,const unsigned int columns, 2877 const unsigned int rows,__local float4* cachedData, 2878 __local float* cachedFilter,const __global float *filter, 2879 const unsigned int width,const float gain, const float threshold, 2880 __global CLQuantum *filteredImage) 2881 { 2882 const unsigned int radius = (width-1)/2; 2883 2884 // cache the pixel shared by the workgroup 2885 const int groupX = get_group_id(0); 2886 const int groupStartY = get_group_id(1)*get_local_size(1) - radius; 2887 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; 2888 2889 if ((groupStartY >= 0) && (groupStopY < rows)) 2890 { 2891 event_t e = async_work_group_strided_copy(cachedData, 2892 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); 2893 wait_group_events(1,&e); 2894 } 2895 else 2896 { 2897 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) 2898 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; 2899 2900 barrier(CLK_LOCAL_MEM_FENCE); 2901 } 2902 // cache the filter as well 2903 event_t e = async_work_group_copy(cachedFilter,filter,width,0); 2904 wait_group_events(1,&e); 2905 2906 // only do the work if this is not a patched item 2907 const int cy = get_global_id(1); 2908 2909 if (cy < rows) 2910 { 2911 float4 blurredPixel = (float4) 0.0f; 2912 2913 int i = 0; 2914 2915 for ( ; i+7 < width; ) 2916 { 2917 for (int j=0; j < 8; j++, i++) 2918 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; 2919 } 2920 2921 for ( ; i < width; i++) 2922 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; 2923 2924 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); 2925 float4 outputPixel = inputImagePixel - blurredPixel; 2926 2927 float quantumThreshold = QuantumRange*threshold; 2928 2929 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); 2930 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); 2931 2932 //write back 2933 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); 2934 } 2935 } 2936 ) 2937 2938 STRINGIFY( 2939 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, 2940 const ChannelType channel,__constant float *filter,const unsigned int width, 2941 const unsigned int columns,const unsigned int rows,__local float4 *pixels, 2942 const float gain,const float threshold,__global CLQuantum *filteredImage) 2943 { 2944 const unsigned int x = get_global_id(0); 2945 const unsigned int y = get_global_id(1); 2946 2947 const unsigned int radius = (width - 1) / 2; 2948 2949 int row = y - radius; 2950 int baseRow = get_group_id(1) * get_local_size(1) - radius; 2951 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; 2952 2953 while (row < endRow) { 2954 int srcy = (row < 0) ? -row : row; // mirror pad 2955 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; 2956 2957 float4 value = 0.0f; 2958 2959 int ix = x - radius; 2960 int i = 0; 2961 2962 while (i + 7 < width) { 2963 for (int j = 0; j < 8; ++j) { // unrolled 2964 int srcx = ix + j; 2965 srcx = (srcx < 0) ? -srcx : srcx; 2966 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2967 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2968 } 2969 ix += 8; 2970 i += 8; 2971 } 2972 2973 while (i < width) { 2974 int srcx = (ix < 0) ? -ix : ix; // mirror pad 2975 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2976 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2977 ++i; 2978 ++ix; 2979 } 2980 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; 2981 row += get_local_size(1); 2982 } 2983 2984 barrier(CLK_LOCAL_MEM_FENCE); 2985 2986 const int px = get_local_id(0); 2987 const int py = get_local_id(1); 2988 const int prp = get_local_size(0); 2989 float4 value = (float4)(0.0f); 2990 2991 int i = 0; 2992 while (i + 7 < width) { 2993 for (int j = 0; j < 8; ++j) // unrolled 2994 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; 2995 i += 8; 2996 } 2997 while (i < width) { 2998 value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; 2999 ++i; 3000 } 3001 3002 if ((x < columns) && (y < rows)) { 3003 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); 3004 float4 diff = srcPixel - value; 3005 3006 float quantumThreshold = QuantumRange*threshold; 3007 3008 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); 3009 value = select(srcPixel + diff * gain, srcPixel, mask); 3010 3011 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); 3012 } 3013 } 3014 ) 3015 3016 STRINGIFY( 3017 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) 3018 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, 3019 const unsigned int number_channels,const unsigned int max_channels, 3020 const float threshold,const int passes,const unsigned int imageWidth, 3021 const unsigned int imageHeight) 3022 { 3023 const int pad = (1 << (passes - 1)); 3024 const int tileSize = 64; 3025 const int tileRowPixels = 64; 3026 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; 3027 3028 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) 3029 3030 local float buffer[64 * 64]; 3031 3032 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0); 3033 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad; 3034 3035 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3036 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + 3037 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; 3038 3039 for (int channel = 0; channel < max_channels; ++channel) 3040 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; 3041 } 3042 3043 for (int channel = 0; channel < max_channels; ++channel) { 3044 // Load LDS 3045 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3046 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); 3047 3048 // Process 3049 3050 float tmp[16]; 3051 float accum[16]; 3052 float pixel; 3053 3054 for (int i = 0; i < 16; i++) 3055 accum[i]=0.0f; 3056 3057 for (int pass = 0; pass < passes; ++pass) { 3058 const int radius = 1 << pass; 3059 const int x = get_local_id(0); 3060 const float thresh = threshold * noise[pass]; 3061 3062 // Apply horizontal hat 3063 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3064 const int offset = i * tileRowPixels; 3065 if (pass == 0) 3066 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass 3067 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); 3068 barrier(CLK_LOCAL_MEM_FENCE); 3069 buffer[x + offset] = pixel; 3070 } 3071 barrier(CLK_LOCAL_MEM_FENCE); 3072 3073 // Apply vertical hat 3074 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3075 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); 3076 float delta = tmp[i / 4] - pixel; 3077 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done 3078 if (delta < -thresh) 3079 delta += thresh; 3080 else if (delta > thresh) 3081 delta -= thresh; 3082 else 3083 delta = 0; 3084 accum[i / 4] += delta; 3085 } 3086 barrier(CLK_LOCAL_MEM_FENCE); 3087 3088 if (pass < passes - 1) 3089 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3090 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass 3091 else // last pass 3092 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3093 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output 3094 barrier(CLK_LOCAL_MEM_FENCE); 3095 } 3096 3097 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3098 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); 3099 3100 barrier(CLK_LOCAL_MEM_FENCE); 3101 } 3102 3103 // Write from stage to output 3104 3105 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { 3106 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3107 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { 3108 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); 3109 for (int channel = 0; channel < max_channels; ++channel) { 3110 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; 3111 } 3112 } 3113 } 3114 } 3115 } 3116 ) 3117 3118 ; 3119 3120 #endif // MAGICKCORE_OPENCL_SUPPORT 3121 3122 #if defined(__cplusplus) || defined(c_plusplus) 3123 } 3124 #endif 3125 3126 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 3127