• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // GENERATED FILE - DO NOT EDIT.
2 // Generated by gen_mtl_internal_shaders.py
3 //
4 // Copyright 2020 The ANGLE Project Authors. All rights reserved.
5 // Use of this source code is governed by a BSD-style license that can be
6 // found in the LICENSE file.
7 //
8 
9 // C++ string version of combined Metal default shaders.
10 
11 static char gDefaultMetallibSrc[] = R"(
12 # include <simd/simd.h>
13 # include <metal_stdlib>
14 namespace rx
15 {
16 namespace mtl_shader
17 {
18 enum
19 {
20     kTextureType2D = 0,
21     kTextureType2DMultisample = 1,
22     kTextureType2DArray = 2,
23     kTextureTypeCube = 3,
24     kTextureType3D = 4,
25     kTextureTypeCount = 5,
26 };
27 }
28 }
29 using namespace metal;
30 constant uint32_t kNumColorOutputs [[function_constant(0)]];
31 constant bool kColorOutputAvailable0 = kNumColorOutputs > 0;
32 constant bool kColorOutputAvailable1 = kNumColorOutputs > 1;
33 constant bool kColorOutputAvailable2 = kNumColorOutputs > 2;
34 constant bool kColorOutputAvailable3 = kNumColorOutputs > 3;
35 constant bool kColorOutputAvailable4 = kNumColorOutputs > 4;
36 constant bool kColorOutputAvailable5 = kNumColorOutputs > 5;
37 constant bool kColorOutputAvailable6 = kNumColorOutputs > 6;
38 constant bool kColorOutputAvailable7 = kNumColorOutputs > 7;
39 namespace rx
40 {
41 namespace mtl_shader
42 {
43 constant float2 gCorners[3] = {float2(-1.0f, -1.0f), float2(3.0f, -1.0f), float2(-1.0f, 3.0f)};
44 template <typename T>
45 struct MultipleColorOutputs
46 {
47     vec<T, 4> color0 [[color(0), function_constant(kColorOutputAvailable0)]];
48     vec<T, 4> color1 [[color(1), function_constant(kColorOutputAvailable1)]];
49     vec<T, 4> color2 [[color(2), function_constant(kColorOutputAvailable2)]];
50     vec<T, 4> color3 [[color(3), function_constant(kColorOutputAvailable3)]];
51     vec<T, 4> color4 [[color(4), function_constant(kColorOutputAvailable4)]];
52     vec<T, 4> color5 [[color(5), function_constant(kColorOutputAvailable5)]];
53     vec<T, 4> color6 [[color(6), function_constant(kColorOutputAvailable6)]];
54     vec<T, 4> color7 [[color(7), function_constant(kColorOutputAvailable7)]];
55 };
56 template <typename T>
57 static inline MultipleColorOutputs<T> toMultipleColorOutputs(vec<T, 4> color)
58 {
59     MultipleColorOutputs<T> re;
60     do { if (kColorOutputAvailable0) { re.color0 = color; } } while (0);
61     do { if (kColorOutputAvailable1) { re.color1 = color; } } while (0);
62     do { if (kColorOutputAvailable2) { re.color2 = color; } } while (0);
63     do { if (kColorOutputAvailable3) { re.color3 = color; } } while (0);
64     do { if (kColorOutputAvailable4) { re.color4 = color; } } while (0);
65     do { if (kColorOutputAvailable5) { re.color5 = color; } } while (0);
66     do { if (kColorOutputAvailable6) { re.color6 = color; } } while (0);
67     do { if (kColorOutputAvailable7) { re.color7 = color; } } while (0);
68     return re;
69 }
70 static inline float3 cubeTexcoords(float2 texcoords, int face)
71 {
72     texcoords = 2.0 * texcoords - 1.0;
73     switch (face)
74     {
75         case 0:
76             return float3(1.0, -texcoords.y, -texcoords.x);
77         case 1:
78             return float3(-1.0, -texcoords.y, texcoords.x);
79         case 2:
80             return float3(texcoords.x, 1.0, texcoords.y);
81         case 3:
82             return float3(texcoords.x, -1.0, -texcoords.y);
83         case 4:
84             return float3(texcoords.x, -texcoords.y, 1.0);
85         case 5:
86             return float3(-texcoords.x, -texcoords.y, -1.0);
87     }
88     return float3(texcoords, 0);
89 }
90 template <typename T>
91 static inline vec<T, 4> resolveTextureMS(texture2d_ms<T> srcTexture, uint2 coords)
92 {
93     uint samples = srcTexture.get_num_samples();
94     vec<T, 4> output(0);
95     for (uint sample = 0; sample < samples; ++sample)
96     {
97         output += srcTexture.read(coords, sample);
98     }
99     output = output / samples;
100     return output;
101 }
102 static inline float4 sRGBtoLinear(float4 color)
103 {
104     float3 linear1 = color.rgb / 12.92;
105     float3 linear2 = powr((color.rgb + float3(0.055)) / 1.055, 2.4);
106     float3 factor = float3(color.rgb <= float3(0.04045));
107     float4 linear = float4(factor * linear1 + float3(1.0 - factor) * linear2, color.a);
108     return linear;
109 }
110 static inline float linearToSRGB(float color)
111 {
112     if (color <= 0.0f)
113         return 0.0f;
114     if (color < 0.0031308f)
115         return 12.92f * color;
116     if (color < 1.0f)
117         return 1.055f * powr(color, 0.41666f) - 0.055f;
118     return 1.0f;
119 }
120 static inline float4 linearToSRGB(float4 color)
121 {
122     return float4(linearToSRGB(color.r), linearToSRGB(color.g), linearToSRGB(color.b), color.a);
123 }
124 template <typename Short>
125 static inline Short bytesToShort(constant uchar *input, uint offset)
126 {
127     Short inputLo = input[offset];
128     Short inputHi = input[offset + 1];
129     return inputLo | (inputHi << 8);
130 }
131 template <typename Int>
132 static inline Int bytesToInt(constant uchar *input, uint offset)
133 {
134     Int input0 = input[offset];
135     Int input1 = input[offset + 1];
136     Int input2 = input[offset + 2];
137     Int input3 = input[offset + 3];
138     return input0 | (input1 << 8) | (input2 << 16) | (input3 << 24);
139 }
140 template <typename Short>
141 static inline void shortToBytes(Short val, uint offset, device uchar *output)
142 {
143     ushort valUnsigned = as_type<ushort>(val);
144     output[offset] = valUnsigned & 0xff;
145     output[offset + 1] = (valUnsigned >> 8) & 0xff;
146 }
147 template <typename Int>
148 static inline void intToBytes(Int val, uint offset, device uchar *output)
149 {
150     uint valUnsigned = as_type<uint>(val);
151     output[offset] = valUnsigned & 0xff;
152     output[offset + 1] = (valUnsigned >> 8) & 0xff;
153     output[offset + 2] = (valUnsigned >> 16) & 0xff;
154     output[offset + 3] = (valUnsigned >> 24) & 0xff;
155 }
156 static inline void floatToBytes(float val, uint offset, device uchar *output)
157 {
158     intToBytes(as_type<uint>(val), offset, output);
159 }
160 static inline void int24bitToBytes(uint val, uint offset, device uchar *output)
161 {
162     output[offset] = val & 0xff;
163     output[offset + 1] = (val >> 8) & 0xff;
164     output[offset + 2] = (val >> 16) & 0xff;
165 }
166 template <unsigned int inputBitCount, unsigned int inputBitStart, typename T>
167 static inline T getShiftedData(T input)
168 {
169     static_assert(inputBitCount + inputBitStart <= (sizeof(T) * 8),
170                   "T must have at least as many bits as inputBitCount + inputBitStart.");
171     const T mask = (1 << inputBitCount) - 1;
172     return (input >> inputBitStart) & mask;
173 }
174 template <unsigned int inputBitCount, unsigned int inputBitStart, typename T>
175 static inline T shiftData(T input)
176 {
177     static_assert(inputBitCount + inputBitStart <= (sizeof(T) * 8),
178                   "T must have at least as many bits as inputBitCount + inputBitStart.");
179     const T mask = (1 << inputBitCount) - 1;
180     return (input & mask) << inputBitStart;
181 }
182 template <unsigned int inputBitCount, typename T>
183 static inline float normalizedToFloat(T input)
184 {
185     static_assert(inputBitCount <= (sizeof(T) * 8),
186                   "T must have more bits than or same bits as inputBitCount.");
187     static_assert(inputBitCount <= 23, "Only single precision is supported");
188     constexpr float inverseMax = 1.0f / ((1 << inputBitCount) - 1);
189     return input * inverseMax;
190 }
191 template <typename T>
192 static inline float normalizedToFloat(T input)
193 {
194     return normalizedToFloat<sizeof(T) * 8, T>(input);
195 }
196 template <>
197 inline float normalizedToFloat(short input)
198 {
199     constexpr float inverseMax = 1.0f / 0x7fff;
200     return static_cast<float>(input) * inverseMax;
201 }
202 template <>
203 inline float normalizedToFloat(int input)
204 {
205     constexpr float inverseMax = 1.0f / 0x7fffffff;
206     return static_cast<float>(input) * inverseMax;
207 }
208 template <>
209 inline float normalizedToFloat(uint input)
210 {
211     constexpr float inverseMax = 1.0f / 0xffffffff;
212     return static_cast<float>(input) * inverseMax;
213 }
214 template <unsigned int outputBitCount, typename T>
215 static inline T floatToNormalized(float input)
216 {
217     static_assert(outputBitCount <= (sizeof(T) * 8),
218                   "T must have more bits than or same bits as inputBitCount.");
219     static_assert(outputBitCount > (metal::is_unsigned<T>::value ? 0 : 1),
220                   "outputBitCount must be at least 1 not counting the sign bit.");
221     constexpr unsigned int bits =
222         metal::is_unsigned<T>::value ? outputBitCount : outputBitCount - 1;
223     static_assert(bits <= 23, "Only single precision is supported");
224     return static_cast<T>(metal::round(((1 << bits) - 1) * input));
225 }
226 template <typename T>
227 static inline T floatToNormalized(float input)
228 {
229     return floatToNormalized<sizeof(T) * 8, T>(input);
230 }
231 }
232 }
233 
234 using namespace rx::mtl_shader;
235 constant bool kPremultiplyAlpha [[function_constant(1)]];
236 constant bool kUnmultiplyAlpha [[function_constant(2)]];
237 constant bool kTransformLinearToSrgb [[function_constant(3)]];
238 constant int kSourceTextureType [[function_constant(4)]];
239 constant int kSourceTexture2Type [[function_constant(5)]];
240 constant bool kSourceTextureType2D = kSourceTextureType == kTextureType2D;
241 constant bool kSourceTextureType2DArray = kSourceTextureType == kTextureType2DArray;
242 constant bool kSourceTextureType2DMS = kSourceTextureType == kTextureType2DMultisample;
243 constant bool kSourceTextureTypeCube = kSourceTextureType == kTextureTypeCube;
244 constant bool kSourceTextureType3D = kSourceTextureType == kTextureType3D;
245 constant bool kSourceTexture2Type2D = kSourceTexture2Type == kTextureType2D;
246 constant bool kSourceTexture2Type2DArray = kSourceTexture2Type == kTextureType2DArray;
247 constant bool kSourceTexture2Type2DMS = kSourceTexture2Type == kTextureType2DMultisample;
248 constant bool kSourceTexture2TypeCube = kSourceTexture2Type == kTextureTypeCube;
249 struct BlitParams
250 {
251     float4 srcTexCoords;
252     int srcLevel;
253     int srcLayer;
254     bool dstLuminance;
255     uint8_t padding[7];
256 };
257 struct BlitVSOut
258 {
259     float4 position [[position]];
260     float2 texCoords [[center_no_perspective, user(locn1)]];
261 };
262 vertex BlitVSOut blitVS(unsigned int vid [[vertex_id]], constant BlitParams &options [[buffer(0)]])
263 {
264     BlitVSOut output;
265     output.position.xy = select(float2(-1.0f), float2(1.0f), bool2(vid & uint2(2, 1)));
266     output.position.zw = float2(0.0, 1.0);
267     output.texCoords = select(options.srcTexCoords.xy, options.srcTexCoords.zw, bool2(vid & uint2(2, 1)));
268     return output;
269 }
270 template <typename SrcTexture2d>
271 static uint2 getImageCoords(SrcTexture2d srcTexture, float2 texCoords)
272 {
273     uint2 dimens(srcTexture.get_width(), srcTexture.get_height());
274     uint2 coords = uint2(texCoords * float2(dimens));
275     return coords;
276 }
277 template <typename T>
278 static inline vec<T, 4> blitSampleTextureMS(texture2d_ms<T> srcTexture, float2 texCoords)
279 {
280     uint2 coords = getImageCoords(srcTexture, texCoords);
281     return resolveTextureMS(srcTexture, coords);
282 }
283 template <typename T>
284 static inline vec<T, 4> blitSampleTexture3D(texture3d<T> srcTexture,
285                                             sampler textureSampler,
286                                             float2 texCoords,
287                                             constant BlitParams &options)
288 {
289     uint depth = srcTexture.get_depth(options.srcLevel);
290     float zCoord = (float(options.srcLayer) + 0.5) / float(depth);
291     return srcTexture.sample(textureSampler, float3(texCoords, zCoord), level(options.srcLevel));
292 }
293 template <typename T>
294 static inline vec<T, 4> blitReadTexture(BlitVSOut input [[stage_in]], texture2d<T> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<T> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<T> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<T> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<T> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
295 {
296     vec<T, 4> output;
297     switch (kSourceTextureType)
298     {
299         case kTextureType2D:
300             output = srcTexture2d.sample(textureSampler, input.texCoords, level(options.srcLevel));
301             break;
302         case kTextureType2DArray:
303             output = srcTexture2dArray.sample(textureSampler, input.texCoords, options.srcLayer,
304                                               level(options.srcLevel));
305             break;
306         case kTextureType2DMultisample:
307             output = blitSampleTextureMS(srcTexture2dMS, input.texCoords);
308             break;
309         case kTextureTypeCube:
310             output = srcTextureCube.sample(textureSampler,
311                                            cubeTexcoords(input.texCoords, options.srcLayer),
312                                            level(options.srcLevel));
313             break;
314         case kTextureType3D:
315             output = blitSampleTexture3D(srcTexture3d, textureSampler, input.texCoords, options);
316             break;
317     }
318     if (kTransformLinearToSrgb) {
319         output.x = linearToSRGB(output.x);
320         output.y = linearToSRGB(output.y);
321         output.z = linearToSRGB(output.z);
322     }
323     if (kUnmultiplyAlpha)
324     {
325         if (output.a != 0.0)
326         {
327             output.xyz /= output.a;
328         }
329     }
330     if (kPremultiplyAlpha)
331     {
332         output.xyz *= output.a;
333     }
334     if (options.dstLuminance)
335     {
336         output.g = output.b = output.r;
337     }
338     return output;
339 }
340 template <typename T>
341 static inline MultipleColorOutputs<T> blitFS(BlitVSOut input [[stage_in]], texture2d<T> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<T> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<T> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<T> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<T> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
342 {
343     vec<T, 4> output = blitReadTexture(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
344     return toMultipleColorOutputs(output);
345 }
346 fragment MultipleColorOutputs<float> blitFloatFS(BlitVSOut input [[stage_in]], texture2d<float> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<float> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<float> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<float> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<float> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
347 {
348     return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
349 }
350 fragment MultipleColorOutputs<int> blitIntFS(BlitVSOut input [[stage_in]], texture2d<int> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<int> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<int> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<int> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<int> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
351 {
352     return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
353 }
354 fragment MultipleColorOutputs<uint> blitUIntFS(BlitVSOut input [[stage_in]], texture2d<uint> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<uint> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<uint> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<uint> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<uint> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
355 {
356     return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
357 }
358 fragment MultipleColorOutputs<uint> copyTextureFloatToUIntFS(BlitVSOut input [[stage_in]], texture2d<float> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<float> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<float> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<float> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<float> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
359 {
360     float4 inputColor = blitReadTexture<>(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
361     uint4 output = uint4(inputColor * float4(255.0));
362     return toMultipleColorOutputs(output);
363 }
364 struct FragmentDepthOut
365 {
366     float depth [[depth(any)]];
367 };
368 static inline float sampleDepth(
369     texture2d<float> srcTexture2d [[function_constant(kSourceTextureType2D)]],
370     texture2d_array<float> srcTexture2dArray [[function_constant(kSourceTextureType2DArray)]],
371     texture2d_ms<float> srcTexture2dMS [[function_constant(kSourceTextureType2DMS)]],
372     texturecube<float> srcTextureCube [[function_constant(kSourceTextureTypeCube)]],
373     float2 texCoords,
374     constant BlitParams &options)
375 {
376     float4 output;
377     constexpr sampler textureSampler(mag_filter::nearest, min_filter::nearest);
378     switch (kSourceTextureType)
379     {
380         case kTextureType2D:
381             output = srcTexture2d.sample(textureSampler, texCoords, level(options.srcLevel));
382             break;
383         case kTextureType2DArray:
384             output = srcTexture2dArray.sample(textureSampler, texCoords, options.srcLayer,
385                                               level(options.srcLevel));
386             break;
387         case kTextureType2DMultisample:
388             output = srcTexture2dMS.read(getImageCoords(srcTexture2dMS, texCoords), 0);
389             break;
390         case kTextureTypeCube:
391             output =
392                 srcTextureCube.sample(textureSampler, cubeTexcoords(texCoords, options.srcLayer),
393                                       level(options.srcLevel));
394             break;
395     }
396     return output.r;
397 }
398 fragment FragmentDepthOut blitDepthFS(BlitVSOut input [[stage_in]],
399                                       texture2d<float> srcTexture2d
400                                       [[texture(0), function_constant(kSourceTextureType2D)]],
401                                       texture2d_array<float> srcTexture2dArray
402                                       [[texture(0), function_constant(kSourceTextureType2DArray)]],
403                                       texture2d_ms<float> srcTexture2dMS
404                                       [[texture(0), function_constant(kSourceTextureType2DMS)]],
405                                       texturecube<float> srcTextureCube
406                                       [[texture(0), function_constant(kSourceTextureTypeCube)]],
407                                       constant BlitParams &options [[buffer(0)]])
408 {
409     FragmentDepthOut re;
410     re.depth = sampleDepth(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
411                            input.texCoords, options);
412     return re;
413 }
414 static inline uint32_t sampleStencil(
415     texture2d<uint32_t> srcTexture2d [[function_constant(kSourceTexture2Type2D)]],
416     texture2d_array<uint32_t> srcTexture2dArray [[function_constant(kSourceTexture2Type2DArray)]],
417     texture2d_ms<uint32_t> srcTexture2dMS [[function_constant(kSourceTexture2Type2DMS)]],
418     texturecube<uint32_t> srcTextureCube [[function_constant(kSourceTexture2TypeCube)]],
419     float2 texCoords,
420     int srcLevel,
421     int srcLayer)
422 {
423     uint4 output;
424     constexpr sampler textureSampler(mag_filter::nearest, min_filter::nearest);
425     switch (kSourceTexture2Type)
426     {
427         case kTextureType2D:
428             output = srcTexture2d.sample(textureSampler, texCoords, level(srcLevel));
429             break;
430         case kTextureType2DArray:
431             output = srcTexture2dArray.sample(textureSampler, texCoords, srcLayer, level(srcLevel));
432             break;
433         case kTextureType2DMultisample:
434             output = srcTexture2dMS.read(getImageCoords(srcTexture2dMS, texCoords), 0);
435             break;
436         case kTextureTypeCube:
437             output = srcTextureCube.sample(textureSampler, cubeTexcoords(texCoords, srcLayer),
438                                            level(srcLevel));
439             break;
440     }
441     return output.r;
442 }
443 struct BlitStencilToBufferParams
444 {
445     float2 srcStartTexCoords;
446     float2 srcTexCoordSteps;
447     int srcLevel;
448     int srcLayer;
449     uint2 dstSize;
450     uint dstBufferRowPitch;
451     bool resolveMS;
452 };
453 kernel void blitStencilToBufferCS(ushort2 gIndices [[thread_position_in_grid]],
454                                   texture2d<uint32_t> srcTexture2d
455                                   [[texture(1), function_constant(kSourceTexture2Type2D)]],
456                                   texture2d_array<uint32_t> srcTexture2dArray
457                                   [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
458                                   texture2d_ms<uint32_t> srcTexture2dMS
459                                   [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
460                                   texturecube<uint32_t> srcTextureCube
461                                   [[texture(1), function_constant(kSourceTexture2TypeCube)]],
462                                   constant BlitStencilToBufferParams &options [[buffer(0)]],
463                                   device uchar *buffer [[buffer(1)]])
464 {
465     if (gIndices.x >= options.dstSize.x || gIndices.y >= options.dstSize.y)
466     {
467         return;
468     }
469     float2 srcTexCoords = options.srcStartTexCoords + float2(gIndices) * options.srcTexCoordSteps;
470     if (kSourceTexture2Type == kTextureType2DMultisample && !options.resolveMS)
471     {
472         uint samples = srcTexture2dMS.get_num_samples();
473         uint2 imageCoords = getImageCoords(srcTexture2dMS, srcTexCoords);
474         uint bufferOffset = options.dstBufferRowPitch * gIndices.y + samples * gIndices.x;
475         for (uint sample = 0; sample < samples; ++sample)
476         {
477             uint stencilPerSample = srcTexture2dMS.read(imageCoords, sample).r;
478             buffer[bufferOffset + sample] = static_cast<uchar>(stencilPerSample);
479         }
480     }
481     else
482     {
483         uint32_t stencil =
484             sampleStencil(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
485                           srcTexCoords, options.srcLevel, options.srcLayer);
486         buffer[options.dstBufferRowPitch * gIndices.y + gIndices.x] = static_cast<uchar>(stencil);
487     }
488 }
489 #if __METAL_VERSION__ >= 210
490 struct FragmentStencilOut
491 {
492     uint32_t stencil [[stencil]];
493 };
494 struct FragmentDepthStencilOut
495 {
496     float depth [[depth(any)]];
497     uint32_t stencil [[stencil]];
498 };
499 fragment FragmentStencilOut blitStencilFS(
500     BlitVSOut input [[stage_in]],
501     texture2d<uint32_t> srcTexture2d [[texture(1), function_constant(kSourceTexture2Type2D)]],
502     texture2d_array<uint32_t> srcTexture2dArray
503     [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
504     texture2d_ms<uint32_t> srcTexture2dMS
505     [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
506     texturecube<uint32_t> srcTextureCube [[texture(1), function_constant(kSourceTexture2TypeCube)]],
507     constant BlitParams &options [[buffer(0)]])
508 {
509     FragmentStencilOut re;
510     re.stencil = sampleStencil(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
511                                input.texCoords, options.srcLevel, options.srcLayer);
512     return re;
513 }
514 fragment FragmentDepthStencilOut blitDepthStencilFS(
515     BlitVSOut input [[stage_in]],
516     texture2d<float> srcDepthTexture2d [[texture(0), function_constant(kSourceTextureType2D)]],
517     texture2d_array<float> srcDepthTexture2dArray
518     [[texture(0), function_constant(kSourceTextureType2DArray)]],
519     texture2d_ms<float> srcDepthTexture2dMS
520     [[texture(0), function_constant(kSourceTextureType2DMS)]],
521     texturecube<float> srcDepthTextureCube
522     [[texture(0), function_constant(kSourceTextureTypeCube)]],
523     texture2d<uint32_t> srcStencilTexture2d
524     [[texture(1), function_constant(kSourceTexture2Type2D)]],
525     texture2d_array<uint32_t> srcStencilTexture2dArray
526     [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
527     texture2d_ms<uint32_t> srcStencilTexture2dMS
528     [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
529     texturecube<uint32_t> srcStencilTextureCube
530     [[texture(1), function_constant(kSourceTexture2TypeCube)]],
531     constant BlitParams &options [[buffer(0)]])
532 {
533     FragmentDepthStencilOut re;
534     re.depth = sampleDepth(srcDepthTexture2d, srcDepthTexture2dArray, srcDepthTexture2dMS,
535                            srcDepthTextureCube, input.texCoords, options);
536     re.stencil =
537         sampleStencil(srcStencilTexture2d, srcStencilTexture2dArray, srcStencilTexture2dMS,
538                       srcStencilTextureCube, input.texCoords, options.srcLevel, options.srcLayer);
539     return re;
540 }
541 #endif
542 using namespace rx::mtl_shader;
543 struct ClearParams
544 {
545     float4 clearColor;
546     float clearDepth;
547 };
548 vertex float4 clearVS(unsigned int vid [[ vertex_id ]],
549                       constant ClearParams &clearParams [[buffer(0)]])
550 {
551     return float4(gCorners[vid], clearParams.clearDepth, 1.0);
552 }
553 fragment MultipleColorOutputs<float> clearFloatFS(constant ClearParams &clearParams [[buffer(0)]])
554 {
555     return toMultipleColorOutputs(clearParams.clearColor);
556 }
557 fragment MultipleColorOutputs<int> clearIntFS(constant ClearParams &clearParams [[buffer(0)]])
558 {
559     return toMultipleColorOutputs(as_type<int4>(clearParams.clearColor));
560 }
561 fragment MultipleColorOutputs<uint> clearUIntFS(constant ClearParams &clearParams [[buffer(0)]])
562 {
563     return toMultipleColorOutputs(as_type<uint4>(clearParams.clearColor));
564 }
565 using namespace rx::mtl_shader;
566 constant bool kSourceBufferAligned[[function_constant(100)]];
567 constant bool kSourceIndexIsU8[[function_constant(200)]];
568 constant bool kSourceIndexIsU16[[function_constant(300)]];
569 constant bool kSourceIndexIsU32[[function_constant(400)]];
570 constant bool kSourceBufferUnaligned = !kSourceBufferAligned;
571 constant bool kUseSourceBufferU8 = kSourceIndexIsU8 || kSourceBufferUnaligned;
572 constant bool kUseSourceBufferU16 = kSourceIndexIsU16 && kSourceBufferAligned;
573 constant bool kUseSourceBufferU32 = kSourceIndexIsU32 && kSourceBufferAligned;
574 struct IndexConversionParams
575 {
576     uint32_t srcOffset;
577     uint32_t indexCount;
578     bool primitiveRestartEnabled;
579 };
580 inline ushort getIndexAligned(constant ushort *inputAligned, uint offset, uint idx)
581 {
582     return inputAligned[offset / 2 + idx];
583 }
584 inline uint getIndexAligned(constant uint *inputAligned, uint offset, uint idx)
585 {
586     return inputAligned[offset / 4 + idx];
587 }
588 inline uchar getIndexAligned(constant uchar *input, uint offset, uint idx)
589 {
590     return input[offset + idx];
591 }
592 inline ushort getIndexUnalignedU16(constant uchar *input, uint offset, uint idx)
593 {
594     ushort inputLo = input[offset + 2 * idx];
595     ushort inputHi = input[offset + 2 * idx + 1];
596     return inputLo | (inputHi << 8);
597 }
598 inline uint getIndexUnalignedU32(constant uchar *input, uint offset, uint idx)
599 {
600     uint input0 = input[offset + 4 * idx];
601     uint input1 = input[offset + 4 * idx + 1];
602     uint input2 = input[offset + 4 * idx + 2];
603     uint input3 = input[offset + 4 * idx + 3];
604     return input0 | (input1 << 8) | (input2 << 16) | (input3 << 24);
605 }
606 kernel void convertIndexU8ToU16(uint idx [[thread_position_in_grid]],
607                                 constant IndexConversionParams &options [[buffer(0)]],
608                                 constant uchar *input [[buffer(1)]],
609                                 device ushort *output [[buffer(2)]])
610 {
611     if (idx >= options.indexCount) { return; };
612     uchar value = getIndexAligned(input, options.srcOffset, idx);
613     if (options.primitiveRestartEnabled && value == 0xff)
614     {
615         output[idx] = 0xffff;
616     }
617     else
618     {
619         output[idx] = value;
620     }
621 }
622 kernel void convertIndexU16(uint idx [[thread_position_in_grid]],
623                             constant IndexConversionParams &options [[buffer(0)]],
624                             constant uchar *input
625                             [[buffer(1), function_constant(kSourceBufferUnaligned)]],
626                             constant ushort *inputAligned
627                             [[buffer(1), function_constant(kSourceBufferAligned)]],
628                             device ushort *output [[buffer(2)]])
629 {
630     if (idx >= options.indexCount) { return; };
631     ushort value;
632     if (kSourceBufferAligned)
633     {
634         value = getIndexAligned(inputAligned, options.srcOffset, idx);
635     }
636     else
637     {
638         value = getIndexUnalignedU16(input, options.srcOffset, idx);
639     }
640     output[idx] = value;
641 }
642 kernel void convertIndexU32(uint idx [[thread_position_in_grid]],
643                             constant IndexConversionParams &options [[buffer(0)]],
644                             constant uchar *input
645                             [[buffer(1), function_constant(kSourceBufferUnaligned)]],
646                             constant uint *inputAligned
647                             [[buffer(1), function_constant(kSourceBufferAligned)]],
648                             device uint *output [[buffer(2)]])
649 {
650     if (idx >= options.indexCount) { return; };
651     uint value;
652     if (kSourceBufferAligned)
653     {
654         value = getIndexAligned(inputAligned, options.srcOffset, idx);
655     }
656     else
657     {
658         value = getIndexUnalignedU32(input, options.srcOffset, idx);
659     }
660     output[idx] = value;
661 }
662 struct IndexFromArrayParams
663 {
664     uint firstVertex;
665     uint vertexCount;
666 };
667 kernel void genTriFanIndicesFromArray(uint idx [[thread_position_in_grid]],
668                                       constant IndexFromArrayParams &options [[buffer(0)]],
669                                       device uint *output [[buffer(2)]])
670 {
671     if (idx >= options.vertexCount) { return; };
672     uint vertexIdx = options.firstVertex + 2 + idx;
673     output[3 * idx ] = vertexIdx - 1;
674     output[3 * idx + 1] = vertexIdx;
675     output[3 * idx + 2] = options.firstVertex;
676 }
677 inline uint getIndexU32(uint offset,
678                         uint idx,
679                         constant uchar *inputU8 [[function_constant(kUseSourceBufferU8)]],
680                         constant ushort *inputU16 [[function_constant(kUseSourceBufferU16)]],
681                         constant uint *inputU32 [[function_constant(kUseSourceBufferU32)]])
682 {
683     if (kUseSourceBufferU8)
684     {
685         if (kSourceIndexIsU16)
686         {
687             return getIndexUnalignedU16(inputU8, offset, idx);
688         }
689         else if (kSourceIndexIsU32)
690         {
691             return getIndexUnalignedU32(inputU8, offset, idx);
692         }
693         return getIndexAligned(inputU8, offset, idx);
694     }
695     else if (kUseSourceBufferU16)
696     {
697         return getIndexAligned(inputU16, offset, idx);
698     }
699     else if (kUseSourceBufferU32)
700     {
701         return getIndexAligned(inputU32, offset, idx);
702     }
703     return 0;
704 }
705 kernel void genTriFanIndicesFromElements(uint idx [[thread_position_in_grid]],
706                                          constant IndexConversionParams &options [[buffer(0)]],
707                                          constant uchar *inputU8
708                                          [[buffer(1), function_constant(kUseSourceBufferU8)]],
709                                          constant ushort *inputU16
710                                          [[buffer(1), function_constant(kUseSourceBufferU16)]],
711                                          constant uint *inputU32
712                                          [[buffer(1), function_constant(kUseSourceBufferU32)]],
713                                          device uint *output [[buffer(2)]])
714 {
715     if (idx >= options.indexCount) { return; };
716     uint elemIdx = 2 + idx;
717     output[3 * idx] = getIndexU32(options.srcOffset, 0, inputU8, inputU16, inputU32);
718     output[3 * idx + 1] = getIndexU32(options.srcOffset, elemIdx - 1, inputU8, inputU16, inputU32);
719     output[3 * idx + 2] = getIndexU32(options.srcOffset, elemIdx, inputU8, inputU16, inputU32);
720 }
721 kernel void genLineLoopIndicesFromArray(uint idx [[thread_position_in_grid]],
722                                         constant IndexFromArrayParams &options [[buffer(0)]],
723                                         device uint *output [[buffer(2)]])
724 {
725     uint totalIndices = options.vertexCount + 1;
726     if (idx >= totalIndices) { return; };
727     output[idx] = options.firstVertex + idx % options.vertexCount;
728 }
729 kernel void genLineLoopIndicesFromElements(uint idx [[thread_position_in_grid]],
730                                            constant IndexConversionParams &options [[buffer(0)]],
731                                            constant uchar *inputU8
732                                            [[buffer(1), function_constant(kUseSourceBufferU8)]],
733                                            constant ushort *inputU16
734                                            [[buffer(1), function_constant(kUseSourceBufferU16)]],
735                                            constant uint *inputU32
736                                            [[buffer(1), function_constant(kUseSourceBufferU32)]],
737                                            device uint *output [[buffer(2)]])
738 {
739     uint totalTargetIndices = options.indexCount + 1;
740     if (idx >= totalTargetIndices) { return; };
741     output[idx] =
742         getIndexU32(options.srcOffset, idx % options.indexCount, inputU8, inputU16, inputU32);
743 }
744 using namespace rx::mtl_shader;
745 struct GenMipParams
746 {
747     uint srcLevel;
748     uint numMipLevelsToGen;
749     bool sRGB;
750 };
751 kernel void generate3DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
752                               ushort3 gIndices [[thread_position_in_grid]],
753                               texture3d<float> srcTexture [[texture(0)]],
754                               texture3d<float, access::write> dstMip1 [[texture(1)]],
755                               texture3d<float, access::write> dstMip2 [[texture(2)]],
756                               texture3d<float, access::write> dstMip3 [[texture(3)]],
757                               texture3d<float, access::write> dstMip4 [[texture(4)]],
758                               constant GenMipParams &options [[buffer(0)]])
759 {
760     ushort3 mipSize = ushort3(dstMip1.get_width(), dstMip1.get_height(), dstMip1.get_depth());
761     bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y && gIndices.z < mipSize.z;
762     constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
763     threadgroup float sR[(8 * 8 * 8)];
764     threadgroup float sG[(8 * 8 * 8)];
765     threadgroup float sB[(8 * 8 * 8)];
766     threadgroup float sA[(8 * 8 * 8)];
767     float4 texel1;
768     if (validThread)
769     {
770         float3 texCoords = (float3(gIndices) + float3(0.5, 0.5, 0.5)) / float3(mipSize);
771         texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
772         dstMip1.write(texel1, gIndices);
773     }
774     else
775     {
776         lIndex = 0xffffffff;
777     }
778     if (options.numMipLevelsToGen == 1)
779     {
780         return;
781     }
782     if (options.sRGB)
783     {
784         texel1 = linearToSRGB(texel1);
785     }
786     sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
787     threadgroup_barrier(mem_flags::mem_threadgroup);
788     if ((lIndex & 0x49) == 0)
789     {
790         bool3 atEdge = gIndices == (mipSize - ushort3(1));
791         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
792         float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
793         float4 texel4 = (atEdge.z) ? (texel1) : (float4(sR[lIndex + (8 * 8)], sG[lIndex + (8 * 8)], sB[lIndex + (8 * 8)], sA[lIndex + (8 * 8)]));
794         float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
795         float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + ((8 * 8) + 1)], sG[lIndex + ((8 * 8) + 1)], sB[lIndex + ((8 * 8) + 1)], sA[lIndex + ((8 * 8) + 1)]));
796         float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + ((8 * 8) + 8)], sG[lIndex + ((8 * 8) + 8)], sB[lIndex + ((8 * 8) + 8)], sA[lIndex + ((8 * 8) + 8)]));
797         float4 texel8 =
798             (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + ((8 * 8) + 8 + 1)], sG[lIndex + ((8 * 8) + 8 + 1)], sB[lIndex + ((8 * 8) + 8 + 1)], sA[lIndex + ((8 * 8) + 8 + 1)]));
799         texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
800         dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 1);
801         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
802     }
803     if (options.numMipLevelsToGen == 2)
804     {
805         return;
806     }
807     threadgroup_barrier(mem_flags::mem_threadgroup);
808     if ((lIndex & 0xdb) == 0)
809     {
810         mipSize = max(mipSize >> 1, ushort3(1));
811         bool3 atEdge = (gIndices >> 1) == (mipSize - ushort3(1));
812         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
813         float4 texel3 =
814             (atEdge.y) ? (texel1) : (float4(sR[lIndex + (2 * 8)], sG[lIndex + (2 * 8)], sB[lIndex + (2 * 8)], sA[lIndex + (2 * 8)]));
815         float4 texel4 =
816             (atEdge.z) ? (texel1) : (float4(sR[lIndex + (2 * (8 * 8))], sG[lIndex + (2 * (8 * 8))], sB[lIndex + (2 * (8 * 8))], sA[lIndex + (2 * (8 * 8))]));
817         float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
818         float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + (2 * (8 * 8) + 2)], sG[lIndex + (2 * (8 * 8) + 2)], sB[lIndex + (2 * (8 * 8) + 2)], sA[lIndex + (2 * (8 * 8) + 2)]));
819         float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + (2 * (8 * 8) + 2 * 8)], sG[lIndex + (2 * (8 * 8) + 2 * 8)], sB[lIndex + (2 * (8 * 8) + 2 * 8)], sA[lIndex + (2 * (8 * 8) + 2 * 8)]));
820         float4 texel8 = (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sG[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sB[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sA[lIndex + (2 * (8 * 8) + 2 * 8 + 2)]));
821         texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
822         dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 2);
823         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
824     }
825     if (options.numMipLevelsToGen == 3)
826     {
827         return;
828     }
829     threadgroup_barrier(mem_flags::mem_threadgroup);
830     if ((lIndex & 0x1ff) == 0)
831     {
832         mipSize = max(mipSize >> 1, ushort3(1));
833         bool3 atEdge = (gIndices >> 2) == (mipSize - ushort3(1));
834         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
835         float4 texel3 =
836             (atEdge.y) ? (texel1) : (float4(sR[lIndex + (4 * 8)], sG[lIndex + (4 * 8)], sB[lIndex + (4 * 8)], sA[lIndex + (4 * 8)]));
837         float4 texel4 =
838             (atEdge.z) ? (texel1) : (float4(sR[lIndex + (4 * (8 * 8))], sG[lIndex + (4 * (8 * 8))], sB[lIndex + (4 * (8 * 8))], sA[lIndex + (4 * (8 * 8))]));
839         float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
840         float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + (4 * (8 * 8) + 4)], sG[lIndex + (4 * (8 * 8) + 4)], sB[lIndex + (4 * (8 * 8) + 4)], sA[lIndex + (4 * (8 * 8) + 4)]));
841         float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + (4 * (8 * 8) + 4 * 8)], sG[lIndex + (4 * (8 * 8) + 4 * 8)], sB[lIndex + (4 * (8 * 8) + 4 * 8)], sA[lIndex + (4 * (8 * 8) + 4 * 8)]));
842         float4 texel8 = (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sG[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sB[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sA[lIndex + (4 * (8 * 8) + 4 * 8 + 4)]));
843         texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
844         dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 3);
845     }
846 }
847 kernel void generate2DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
848                               ushort2 gIndices [[thread_position_in_grid]],
849                               texture2d<float> srcTexture [[texture(0)]],
850                               texture2d<float, access::write> dstMip1 [[texture(1)]],
851                               texture2d<float, access::write> dstMip2 [[texture(2)]],
852                               texture2d<float, access::write> dstMip3 [[texture(3)]],
853                               texture2d<float, access::write> dstMip4 [[texture(4)]],
854                               constant GenMipParams &options [[buffer(0)]])
855 {
856     uint firstMipLevel = options.srcLevel + 1;
857     ushort2 mipSize =
858         ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
859     bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y;
860     constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
861     threadgroup float sR[(8 * 8)];
862     threadgroup float sG[(8 * 8)];
863     threadgroup float sB[(8 * 8)];
864     threadgroup float sA[(8 * 8)];
865     float4 texel1;
866     if (validThread)
867     {
868         float2 texCoords = (float2(gIndices) + float2(0.5, 0.5)) / float2(mipSize);
869         texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
870         dstMip1.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices);
871     }
872     else
873     {
874         lIndex = 0xffffffff;
875     }
876     if (options.numMipLevelsToGen == 1)
877     {
878         return;
879     }
880     sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
881     threadgroup_barrier(mem_flags::mem_threadgroup);
882     if ((lIndex & 0x09) == 0)
883     {
884         bool2 atEdge = gIndices == (mipSize - ushort2(1));
885         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
886         float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
887         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
888         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
889         dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 1);
890         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
891     }
892     if (options.numMipLevelsToGen == 2)
893     {
894         return;
895     }
896     threadgroup_barrier(mem_flags::mem_threadgroup);
897     if ((lIndex & 0x1b) == 0)
898     {
899         mipSize = max(mipSize >> 1, ushort2(1));
900         bool2 atEdge = (gIndices >> 1) == (mipSize - ushort2(1));
901         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
902         float4 texel3 =
903             (atEdge.y) ? (texel1) : (float4(sR[lIndex + 2 * 8], sG[lIndex + 2 * 8], sB[lIndex + 2 * 8], sA[lIndex + 2 * 8]));
904         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
905         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
906         dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 2);
907         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
908     }
909     if (options.numMipLevelsToGen == 3)
910     {
911         return;
912     }
913     threadgroup_barrier(mem_flags::mem_threadgroup);
914     if ((lIndex & 0x3f) == 0)
915     {
916         mipSize = max(mipSize >> 1, ushort2(1));
917         bool2 atEdge = (gIndices >> 2) == (mipSize - ushort2(1));
918         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
919         float4 texel3 =
920             (atEdge.y) ? (texel1) : (float4(sR[lIndex + 4 * 8], sG[lIndex + 4 * 8], sB[lIndex + 4 * 8], sA[lIndex + 4 * 8]));
921         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
922         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
923         dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 3);
924     }
925 }
926 template <typename TextureTypeR, typename TextureTypeW>
927 static __attribute__((always_inline)) void generateCubeOr2DArray2ndAndMoreMipmaps(
928     uint lIndex,
929     ushort3 gIndices,
930     TextureTypeR srcTexture,
931     TextureTypeW dstMip2,
932     TextureTypeW dstMip3,
933     TextureTypeW dstMip4,
934     ushort2 mip1Size,
935     float4 mip1Texel,
936     threadgroup float *sR,
937     threadgroup float *sG,
938     threadgroup float *sB,
939     threadgroup float *sA,
940     constant GenMipParams &options)
941 {
942     ushort2 mipSize = mip1Size;
943     float4 texel1 = mip1Texel;
944     sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
945     threadgroup_barrier(mem_flags::mem_threadgroup);
946     if ((lIndex & 0x09) == 0)
947     {
948         bool2 atEdge = gIndices.xy == (mipSize - ushort2(1));
949         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
950         float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
951         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
952         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
953         dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 1, gIndices.z);
954         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
955     }
956     if (options.numMipLevelsToGen == 2)
957     {
958         return;
959     }
960     threadgroup_barrier(mem_flags::mem_threadgroup);
961     if ((lIndex & 0x1b) == 0)
962     {
963         mipSize = max(mipSize >> 1, ushort2(1));
964         bool2 atEdge = (gIndices.xy >> 1) == (mipSize - ushort2(1));
965         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
966         float4 texel3 =
967             (atEdge.y) ? (texel1) : (float4(sR[lIndex + 2 * 8], sG[lIndex + 2 * 8], sB[lIndex + 2 * 8], sA[lIndex + 2 * 8]));
968         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
969         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
970         dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 2, gIndices.z);
971         sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
972     }
973     if (options.numMipLevelsToGen == 3)
974     {
975         return;
976     }
977     threadgroup_barrier(mem_flags::mem_threadgroup);
978     if ((lIndex & 0x3f) == 0)
979     {
980         mipSize = max(mipSize >> 1, ushort2(1));
981         bool2 atEdge = (gIndices.xy >> 2) == (mipSize - ushort2(1));
982         float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
983         float4 texel3 =
984             (atEdge.y) ? (texel1) : (float4(sR[lIndex + 4 * 8], sG[lIndex + 4 * 8], sB[lIndex + 4 * 8], sA[lIndex + 4 * 8]));
985         float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
986         texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
987         dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 3, gIndices.z);
988     }
989 }
990 kernel void generateCubeMipmaps(uint lIndex [[thread_index_in_threadgroup]],
991                                 ushort3 gIndices [[thread_position_in_grid]],
992                                 texturecube<float> srcTexture [[texture(0)]],
993                                 texturecube<float, access::write> dstMip1 [[texture(1)]],
994                                 texturecube<float, access::write> dstMip2 [[texture(2)]],
995                                 texturecube<float, access::write> dstMip3 [[texture(3)]],
996                                 texturecube<float, access::write> dstMip4 [[texture(4)]],
997                                 constant GenMipParams &options [[buffer(0)]])
998 {
999     uint firstMipLevel = options.srcLevel + 1;
1000     ushort2 mip1Size =
1001         ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
1002     bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
1003     constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
1004     float4 mip1Texel;
1005     if (validThread)
1006     {
1007         float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
1008         mip1Texel = srcTexture.sample(textureSampler, cubeTexcoords(texCoords, int(gIndices.z)),
1009                                       level(options.srcLevel));
1010         dstMip1.write((options.sRGB ? sRGBtoLinear(mip1Texel) : mip1Texel), gIndices.xy, gIndices.z);
1011     }
1012     else
1013     {
1014         lIndex = 0xffffffff;
1015     }
1016     if (options.numMipLevelsToGen == 1)
1017     {
1018         return;
1019     }
1020     threadgroup float sR[(8 * 8)];
1021     threadgroup float sG[(8 * 8)];
1022     threadgroup float sB[(8 * 8)];
1023     threadgroup float sA[(8 * 8)];
1024     generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
1025                                            mip1Size, mip1Texel, sR, sG, sB, sA, options);
1026 }
1027 kernel void generate2DArrayMipmaps(uint lIndex [[thread_index_in_threadgroup]],
1028                                    ushort3 gIndices [[thread_position_in_grid]],
1029                                    texture2d_array<float> srcTexture [[texture(0)]],
1030                                    texture2d_array<float, access::write> dstMip1 [[texture(1)]],
1031                                    texture2d_array<float, access::write> dstMip2 [[texture(2)]],
1032                                    texture2d_array<float, access::write> dstMip3 [[texture(3)]],
1033                                    texture2d_array<float, access::write> dstMip4 [[texture(4)]],
1034                                    constant GenMipParams &options [[buffer(0)]])
1035 {
1036     uint firstMipLevel = options.srcLevel + 1;
1037     ushort2 mip1Size =
1038         ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
1039     bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
1040     constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
1041     float4 mip1Texel;
1042     if (validThread)
1043     {
1044         float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
1045         mip1Texel =
1046             srcTexture.sample(textureSampler, texCoords, gIndices.z, level(options.srcLevel));
1047         dstMip1.write((options.sRGB ? sRGBtoLinear(mip1Texel) : mip1Texel), gIndices.xy, gIndices.z);
1048     }
1049     else
1050     {
1051         lIndex = 0xffffffff;
1052     }
1053     if (options.numMipLevelsToGen == 1)
1054     {
1055         return;
1056     }
1057     threadgroup float sR[(8 * 8)];
1058     threadgroup float sG[(8 * 8)];
1059     threadgroup float sB[(8 * 8)];
1060     threadgroup float sA[(8 * 8)];
1061     generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
1062                                            mip1Size, mip1Texel, sR, sG, sB, sA, options);
1063 }
1064 #include <metal_pack>
1065 namespace rx
1066 {
1067 namespace mtl_shader
1068 {
1069 namespace FormatID
1070 {
1071 enum
1072 {
1073     NONE,
1074     D16_UNORM,
1075     D24_UNORM_S8_UINT,
1076     D24_UNORM_X8_UINT,
1077     D32_FLOAT,
1078     D32_FLOAT_S8X24_UINT,
1079     D32_UNORM,
1080     S8_UINT,
1081     A16_FLOAT,
1082     A1R5G5B5_UNORM,
1083     A2R10G10B10_SINT_VERTEX,
1084     A2R10G10B10_SNORM_VERTEX,
1085     A2R10G10B10_SSCALED_VERTEX,
1086     A2R10G10B10_UINT_VERTEX,
1087     A2R10G10B10_UNORM_VERTEX,
1088     A2R10G10B10_USCALED_VERTEX,
1089     A32_FLOAT,
1090     A8_UNORM,
1091     ASTC_10x10_SRGB_BLOCK,
1092     ASTC_10x10_UNORM_BLOCK,
1093     ASTC_10x5_SRGB_BLOCK,
1094     ASTC_10x5_UNORM_BLOCK,
1095     ASTC_10x6_SRGB_BLOCK,
1096     ASTC_10x6_UNORM_BLOCK,
1097     ASTC_10x8_SRGB_BLOCK,
1098     ASTC_10x8_UNORM_BLOCK,
1099     ASTC_12x10_SRGB_BLOCK,
1100     ASTC_12x10_UNORM_BLOCK,
1101     ASTC_12x12_SRGB_BLOCK,
1102     ASTC_12x12_UNORM_BLOCK,
1103     ASTC_3x3x3_UNORM_BLOCK,
1104     ASTC_3x3x3_UNORM_SRGB_BLOCK,
1105     ASTC_4x3x3_UNORM_BLOCK,
1106     ASTC_4x3x3_UNORM_SRGB_BLOCK,
1107     ASTC_4x4_SRGB_BLOCK,
1108     ASTC_4x4_UNORM_BLOCK,
1109     ASTC_4x4x3_UNORM_BLOCK,
1110     ASTC_4x4x3_UNORM_SRGB_BLOCK,
1111     ASTC_4x4x4_UNORM_BLOCK,
1112     ASTC_4x4x4_UNORM_SRGB_BLOCK,
1113     ASTC_5x4_SRGB_BLOCK,
1114     ASTC_5x4_UNORM_BLOCK,
1115     ASTC_5x4x4_UNORM_BLOCK,
1116     ASTC_5x4x4_UNORM_SRGB_BLOCK,
1117     ASTC_5x5_SRGB_BLOCK,
1118     ASTC_5x5_UNORM_BLOCK,
1119     ASTC_5x5x4_UNORM_BLOCK,
1120     ASTC_5x5x4_UNORM_SRGB_BLOCK,
1121     ASTC_5x5x5_UNORM_BLOCK,
1122     ASTC_5x5x5_UNORM_SRGB_BLOCK,
1123     ASTC_6x5_SRGB_BLOCK,
1124     ASTC_6x5_UNORM_BLOCK,
1125     ASTC_6x5x5_UNORM_BLOCK,
1126     ASTC_6x5x5_UNORM_SRGB_BLOCK,
1127     ASTC_6x6_SRGB_BLOCK,
1128     ASTC_6x6_UNORM_BLOCK,
1129     ASTC_6x6x5_UNORM_BLOCK,
1130     ASTC_6x6x5_UNORM_SRGB_BLOCK,
1131     ASTC_6x6x6_UNORM_BLOCK,
1132     ASTC_6x6x6_UNORM_SRGB_BLOCK,
1133     ASTC_8x5_SRGB_BLOCK,
1134     ASTC_8x5_UNORM_BLOCK,
1135     ASTC_8x6_SRGB_BLOCK,
1136     ASTC_8x6_UNORM_BLOCK,
1137     ASTC_8x8_SRGB_BLOCK,
1138     ASTC_8x8_UNORM_BLOCK,
1139     B10G10R10A2_UNORM,
1140     B4G4R4A4_UNORM,
1141     B5G5R5A1_UNORM,
1142     B5G6R5_UNORM,
1143     B8G8R8A8_TYPELESS,
1144     B8G8R8A8_TYPELESS_SRGB,
1145     B8G8R8A8_UNORM,
1146     B8G8R8A8_UNORM_SRGB,
1147     B8G8R8X8_UNORM,
1148     B8G8R8X8_UNORM_SRGB,
1149     BC1_RGBA_UNORM_BLOCK,
1150     BC1_RGBA_UNORM_SRGB_BLOCK,
1151     BC1_RGB_UNORM_BLOCK,
1152     BC1_RGB_UNORM_SRGB_BLOCK,
1153     BC2_RGBA_UNORM_BLOCK,
1154     BC2_RGBA_UNORM_SRGB_BLOCK,
1155     BC3_RGBA_UNORM_BLOCK,
1156     BC3_RGBA_UNORM_SRGB_BLOCK,
1157     BC4_RED_SNORM_BLOCK,
1158     BC4_RED_UNORM_BLOCK,
1159     BC5_RG_SNORM_BLOCK,
1160     BC5_RG_UNORM_BLOCK,
1161     BC6H_RGB_SFLOAT_BLOCK,
1162     BC6H_RGB_UFLOAT_BLOCK,
1163     BC7_RGBA_UNORM_BLOCK,
1164     BC7_RGBA_UNORM_SRGB_BLOCK,
1165     EAC_R11G11_SNORM_BLOCK,
1166     EAC_R11G11_UNORM_BLOCK,
1167     EAC_R11_SNORM_BLOCK,
1168     EAC_R11_UNORM_BLOCK,
1169     ETC1_LOSSY_DECODE_R8G8B8_UNORM_BLOCK,
1170     ETC1_R8G8B8_UNORM_BLOCK,
1171     ETC2_R8G8B8A1_SRGB_BLOCK,
1172     ETC2_R8G8B8A1_UNORM_BLOCK,
1173     ETC2_R8G8B8A8_SRGB_BLOCK,
1174     ETC2_R8G8B8A8_UNORM_BLOCK,
1175     ETC2_R8G8B8_SRGB_BLOCK,
1176     ETC2_R8G8B8_UNORM_BLOCK,
1177     G8_B8R8_2PLANE_420_UNORM,
1178     G8_B8_R8_3PLANE_420_UNORM,
1179     L16A16_FLOAT,
1180     L16_FLOAT,
1181     L32A32_FLOAT,
1182     L32_FLOAT,
1183     L4A4_UNORM,
1184     L8A8_UNORM,
1185     L8_UNORM,
1186     PALETTE4_R4G4B4A4_UNORM,
1187     PALETTE4_R5G5B5A1_UNORM,
1188     PALETTE4_R5G6B5_UNORM,
1189     PALETTE4_R8G8B8A8_UNORM,
1190     PALETTE4_R8G8B8_UNORM,
1191     PALETTE8_R4G4B4A4_UNORM,
1192     PALETTE8_R5G5B5A1_UNORM,
1193     PALETTE8_R5G6B5_UNORM,
1194     PALETTE8_R8G8B8A8_UNORM,
1195     PALETTE8_R8G8B8_UNORM,
1196     PVRTC1_RGBA_2BPP_UNORM_BLOCK,
1197     PVRTC1_RGBA_2BPP_UNORM_SRGB_BLOCK,
1198     PVRTC1_RGBA_4BPP_UNORM_BLOCK,
1199     PVRTC1_RGBA_4BPP_UNORM_SRGB_BLOCK,
1200     PVRTC1_RGB_2BPP_UNORM_BLOCK,
1201     PVRTC1_RGB_2BPP_UNORM_SRGB_BLOCK,
1202     PVRTC1_RGB_4BPP_UNORM_BLOCK,
1203     PVRTC1_RGB_4BPP_UNORM_SRGB_BLOCK,
1204     R10G10B10A2_SINT,
1205     R10G10B10A2_SNORM,
1206     R10G10B10A2_SSCALED,
1207     R10G10B10A2_UINT,
1208     R10G10B10A2_UNORM,
1209     R10G10B10A2_USCALED,
1210     R10G10B10X2_UNORM,
1211     R11G11B10_FLOAT,
1212     R16G16B16A16_FLOAT,
1213     R16G16B16A16_SINT,
1214     R16G16B16A16_SNORM,
1215     R16G16B16A16_SSCALED,
1216     R16G16B16A16_UINT,
1217     R16G16B16A16_UNORM,
1218     R16G16B16A16_USCALED,
1219     R16G16B16_FLOAT,
1220     R16G16B16_SINT,
1221     R16G16B16_SNORM,
1222     R16G16B16_SSCALED,
1223     R16G16B16_UINT,
1224     R16G16B16_UNORM,
1225     R16G16B16_USCALED,
1226     R16G16_FLOAT,
1227     R16G16_SINT,
1228     R16G16_SNORM,
1229     R16G16_SSCALED,
1230     R16G16_UINT,
1231     R16G16_UNORM,
1232     R16G16_USCALED,
1233     R16_FLOAT,
1234     R16_SINT,
1235     R16_SNORM,
1236     R16_SSCALED,
1237     R16_UINT,
1238     R16_UNORM,
1239     R16_USCALED,
1240     R32G32B32A32_FIXED,
1241     R32G32B32A32_FLOAT,
1242     R32G32B32A32_SINT,
1243     R32G32B32A32_SNORM,
1244     R32G32B32A32_SSCALED,
1245     R32G32B32A32_UINT,
1246     R32G32B32A32_UNORM,
1247     R32G32B32A32_USCALED,
1248     R32G32B32_FIXED,
1249     R32G32B32_FLOAT,
1250     R32G32B32_SINT,
1251     R32G32B32_SNORM,
1252     R32G32B32_SSCALED,
1253     R32G32B32_UINT,
1254     R32G32B32_UNORM,
1255     R32G32B32_USCALED,
1256     R32G32_FIXED,
1257     R32G32_FLOAT,
1258     R32G32_SINT,
1259     R32G32_SNORM,
1260     R32G32_SSCALED,
1261     R32G32_UINT,
1262     R32G32_UNORM,
1263     R32G32_USCALED,
1264     R32_FIXED,
1265     R32_FLOAT,
1266     R32_SINT,
1267     R32_SNORM,
1268     R32_SSCALED,
1269     R32_UINT,
1270     R32_UNORM,
1271     R32_USCALED,
1272     R4G4B4A4_UNORM,
1273     R5G5B5A1_UNORM,
1274     R5G6B5_UNORM,
1275     R8G8B8A8_SINT,
1276     R8G8B8A8_SNORM,
1277     R8G8B8A8_SSCALED,
1278     R8G8B8A8_TYPELESS,
1279     R8G8B8A8_TYPELESS_SRGB,
1280     R8G8B8A8_UINT,
1281     R8G8B8A8_UNORM,
1282     R8G8B8A8_UNORM_SRGB,
1283     R8G8B8A8_USCALED,
1284     R8G8B8X8_UNORM,
1285     R8G8B8X8_UNORM_SRGB,
1286     R8G8B8_SINT,
1287     R8G8B8_SNORM,
1288     R8G8B8_SSCALED,
1289     R8G8B8_UINT,
1290     R8G8B8_UNORM,
1291     R8G8B8_UNORM_SRGB,
1292     R8G8B8_USCALED,
1293     R8G8_SINT,
1294     R8G8_SNORM,
1295     R8G8_SSCALED,
1296     R8G8_UINT,
1297     R8G8_UNORM,
1298     R8G8_UNORM_SRGB,
1299     R8G8_USCALED,
1300     R8_SINT,
1301     R8_SNORM,
1302     R8_SSCALED,
1303     R8_UINT,
1304     R8_UNORM,
1305     R8_UNORM_SRGB,
1306     R8_USCALED,
1307     R9G9B9E5_SHAREDEXP,
1308     X2R10G10B10_SINT_VERTEX,
1309     X2R10G10B10_SNORM_VERTEX,
1310     X2R10G10B10_SSCALED_VERTEX,
1311     X2R10G10B10_UINT_VERTEX,
1312     X2R10G10B10_UNORM_VERTEX,
1313     X2R10G10B10_USCALED_VERTEX,
1314     EXTERNAL0,
1315     EXTERNAL1,
1316     EXTERNAL2,
1317     EXTERNAL3,
1318     EXTERNAL4,
1319     EXTERNAL5,
1320     EXTERNAL6,
1321     EXTERNAL7
1322 };
1323 }
1324 }
1325 }
1326 
1327 using namespace rx::mtl_shader;
1328 constant int kCopyFormatType [[function_constant(10)]];
1329 constant int kCopyTextureType [[function_constant(20)]];
1330 constant bool kCopyTextureType2D = kCopyTextureType == kTextureType2D;
1331 constant bool kCopyTextureType2DArray = kCopyTextureType == kTextureType2DArray;
1332 constant bool kCopyTextureType2DMS = kCopyTextureType == kTextureType2DMultisample;
1333 constant bool kCopyTextureTypeCube = kCopyTextureType == kTextureTypeCube;
1334 constant bool kCopyTextureType3D = kCopyTextureType == kTextureType3D;
1335 struct CopyPixelParams
1336 {
1337     uint3 copySize;
1338     uint3 textureOffset;
1339     uint bufferStartOffset;
1340     uint pixelSize;
1341     uint bufferRowPitch;
1342     uint bufferDepthPitch;
1343 };
1344 struct WritePixelParams
1345 {
1346     uint2 copySize;
1347     uint2 textureOffset;
1348     uint bufferStartOffset;
1349     uint pixelSize;
1350     uint bufferRowPitch;
1351     uint textureLevel;
1352     uint textureLayer;
1353     bool reverseTextureRowOrder;
1354 };
1355 template <typename T>
1356 static inline void textureWrite(ushort3 gIndices,
1357                                 constant CopyPixelParams &options,
1358                                 vec<T, 4> color,
1359                                 texture2d<T, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<T, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<T, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<T, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
1360 {
1361     uint3 writeIndices = options.textureOffset + uint3(gIndices);
1362     switch (kCopyTextureType)
1363     {
1364         case kTextureType2D:
1365             dstTexture2d.write(color, writeIndices.xy);
1366             break;
1367         case kTextureType2DArray:
1368             dstTexture2dArray.write(color, writeIndices.xy, writeIndices.z);
1369             break;
1370         case kTextureType3D:
1371             dstTexture3d.write(color, writeIndices);
1372             break;
1373         case kTextureTypeCube:
1374             dstTextureCube.write(color, writeIndices.xy, writeIndices.z);
1375             break;
1376     }
1377 }
1378 template <typename T>
1379 static inline vec<T, 4> textureRead(ushort2 gIndices,
1380                                     constant WritePixelParams &options,
1381                                     texture2d<T, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<T, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<T, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<T, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<T, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]])
1382 {
1383     vec<T, 4> color;
1384     uint2 coords = uint2(gIndices);
1385     if (options.reverseTextureRowOrder)
1386     {
1387         coords.y = options.copySize.y - 1 - gIndices.y;
1388     }
1389     coords += options.textureOffset;
1390     switch (kCopyTextureType)
1391     {
1392         case kTextureType2D:
1393             color = srcTexture2d.read(coords.xy, options.textureLevel);
1394             break;
1395         case kTextureType2DArray:
1396             color = srcTexture2dArray.read(coords.xy, options.textureLayer, options.textureLevel);
1397             break;
1398         case kTextureType2DMultisample:
1399             color = resolveTextureMS(srcTexture2dMS, coords.xy);
1400             break;
1401         case kTextureType3D:
1402             color = srcTexture3d.read(uint3(coords, options.textureLayer), options.textureLevel);
1403             break;
1404         case kTextureTypeCube:
1405             color = srcTextureCube.read(coords.xy, options.textureLayer, options.textureLevel);
1406             break;
1407     }
1408     return color;
1409 }
1410 static inline float4 readR5G6B5_UNORM(uint bufferOffset, constant uchar *buffer)
1411 {
1412     float4 color;
1413     ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1414     color.r = normalizedToFloat<5>(getShiftedData<5, 11>(src));
1415     color.g = normalizedToFloat<6>(getShiftedData<6, 5>(src));
1416     color.b = normalizedToFloat<5>(getShiftedData<5, 0>(src));
1417     color.a = 1.0;
1418     return color;
1419 }
1420 static inline void writeR5G6B5_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1421 {
1422     ushort dst = shiftData<5, 11>(floatToNormalized<5, ushort>(color.r)) |
1423                  shiftData<6, 5>(floatToNormalized<6, ushort>(color.g)) |
1424                  shiftData<5, 0>(floatToNormalized<5, ushort>(color.b));
1425     shortToBytes(dst, bufferOffset, buffer);
1426 }
1427 static inline float4 readR4G4B4A4_UNORM(uint bufferOffset, constant uchar *buffer)
1428 {
1429     float4 color;
1430     ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1431     color.r = normalizedToFloat<4>(getShiftedData<4, 12>(src));
1432     color.g = normalizedToFloat<4>(getShiftedData<4, 8>(src));
1433     color.b = normalizedToFloat<4>(getShiftedData<4, 4>(src));
1434     color.a = normalizedToFloat<4>(getShiftedData<4, 0>(src));
1435     return color;
1436 }
1437 static inline void writeR4G4B4A4_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1438 {
1439     ushort dst = shiftData<4, 12>(floatToNormalized<4, ushort>(color.r)) |
1440                  shiftData<4, 8>(floatToNormalized<4, ushort>(color.g)) |
1441                  shiftData<4, 4>(floatToNormalized<4, ushort>(color.b)) |
1442                  shiftData<4, 0>(floatToNormalized<4, ushort>(color.a));
1443     ;
1444     shortToBytes(dst, bufferOffset, buffer);
1445 }
1446 static inline float4 readR5G5B5A1_UNORM(uint bufferOffset, constant uchar *buffer)
1447 {
1448     float4 color;
1449     ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1450     color.r = normalizedToFloat<5>(getShiftedData<5, 11>(src));
1451     color.g = normalizedToFloat<5>(getShiftedData<5, 6>(src));
1452     color.b = normalizedToFloat<5>(getShiftedData<5, 1>(src));
1453     color.a = normalizedToFloat<1>(getShiftedData<1, 0>(src));
1454     return color;
1455 }
1456 static inline void writeR5G5B5A1_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1457 {
1458     ushort dst = shiftData<5, 11>(floatToNormalized<5, ushort>(color.r)) |
1459                  shiftData<5, 6>(floatToNormalized<5, ushort>(color.g)) |
1460                  shiftData<5, 1>(floatToNormalized<5, ushort>(color.b)) |
1461                  shiftData<1, 0>(floatToNormalized<1, ushort>(color.a));
1462     ;
1463     shortToBytes(dst, bufferOffset, buffer);
1464 }
1465 static inline int4 readR10G10B10A2_SINT(uint bufferOffset, constant uchar *buffer)
1466 {
1467     int4 color;
1468     int src = bytesToInt<int>(buffer, bufferOffset);
1469     constexpr int3 rgbSignMask(0x200);
1470     constexpr int3 negativeMask(0xfffffc00);
1471     constexpr int alphaSignMask = 0x2;
1472     constexpr int alphaNegMask = 0xfffffffc;
1473     color.r = getShiftedData<10, 0>(src);
1474     color.g = getShiftedData<10, 10>(src);
1475     color.b = getShiftedData<10, 20>(src);
1476     int3 isRgbNegative = (color.rgb & rgbSignMask) >> 9;
1477     color.rgb = (isRgbNegative * negativeMask) | color.rgb;
1478     color.a = getShiftedData<2, 30>(src);
1479     int isAlphaNegative = color.a & alphaSignMask >> 1;
1480     color.a = (isAlphaNegative * alphaNegMask) | color.a;
1481     return color;
1482 }
1483 static inline uint4 readR10G10B10A2_UINT(uint bufferOffset, constant uchar *buffer)
1484 {
1485     uint4 color;
1486     uint src = bytesToInt<uint>(buffer, bufferOffset);
1487     color.r = getShiftedData<10, 0>(src);
1488     color.g = getShiftedData<10, 10>(src);
1489     color.b = getShiftedData<10, 20>(src);
1490     color.a = getShiftedData<2, 30>(src);
1491     return color;
1492 }
1493 static inline float4 readR8G8B8A8(uint bufferOffset, constant uchar *buffer, bool isSRGB)
1494 {
1495     float4 color;
1496     uint src = bytesToInt<uint>(buffer, bufferOffset);
1497     if (isSRGB)
1498     {
1499         color = unpack_unorm4x8_srgb_to_float(src);
1500     }
1501     else
1502     {
1503         color = unpack_unorm4x8_to_float(src);
1504     }
1505     return color;
1506 }
1507 static inline void writeR8G8B8A8(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer, bool isSRGB)
1508 {
1509     uint dst;
1510     if (isSRGB)
1511     {
1512         dst = pack_float_to_srgb_unorm4x8(color);
1513     }
1514     else
1515     {
1516         dst = pack_float_to_unorm4x8(color);
1517     }
1518     intToBytes(dst, bufferOffset, buffer);
1519 }
1520 static inline float4 readR8G8B8(uint bufferOffset, constant uchar *buffer, bool isSRGB)
1521 {
1522     float4 color;
1523     color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1524     color.g = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1525     color.b = normalizedToFloat<uchar>(buffer[bufferOffset + 2]);
1526     color.a = 1.0;
1527     if (isSRGB)
1528     {
1529         color = sRGBtoLinear(color);
1530     }
1531     return color;
1532 }
1533 static inline void writeR8G8B8(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer, bool isSRGB)
1534 {
1535     color.a = 1.0;
1536     uint dst;
1537     if (isSRGB)
1538     {
1539         dst = pack_float_to_srgb_unorm4x8(color);
1540     }
1541     else
1542     {
1543         dst = pack_float_to_unorm4x8(color);
1544     }
1545     int24bitToBytes(dst, bufferOffset, buffer);
1546 }
1547 static inline float4 readR8G8B8A8_SNORM(uint bufferOffset, constant uchar *buffer)
1548 {
1549     float4 color;
1550     uint src = bytesToInt<uint>(buffer, bufferOffset);
1551     color = unpack_snorm4x8_to_float(src);
1552     return color;
1553 }
1554 static inline void writeR8G8B8A8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1555 {
1556     uint dst = pack_float_to_snorm4x8(color);
1557     intToBytes(dst, bufferOffset, buffer);
1558 }
1559 static inline float4 readR8G8B8_SNORM(uint bufferOffset, constant uchar *buffer)
1560 {
1561     float4 color;
1562     color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1563     color.g = normalizedToFloat<7, char>(buffer[bufferOffset + 1]);
1564     color.b = normalizedToFloat<7, char>(buffer[bufferOffset + 2]);
1565     color.a = 1.0;
1566     return color;
1567 }
1568 static inline void writeR8G8B8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1569 {
1570     uint dst = pack_float_to_snorm4x8(color);
1571     int24bitToBytes(dst, bufferOffset, buffer);
1572 }
1573 static inline float4 readR8G8B8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1574 {
1575     return readR8G8B8A8(bufferOffset, buffer, false);
1576 }
1577 static inline void writeR8G8B8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1578 {
1579     return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, false);
1580 }
1581 static inline float4 readR8G8B8A8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1582 {
1583     return readR8G8B8A8(bufferOffset, buffer, true);
1584 }
1585 static inline void writeR8G8B8A8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1586 {
1587     return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, true);
1588 }
1589 static inline float4 readB8G8R8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1590 {
1591     return readR8G8B8A8(bufferOffset, buffer, false).bgra;
1592 }
1593 static inline void writeB8G8R8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1594 {
1595     color.rgba = color.bgra;
1596     return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, false);
1597 }
1598 static inline float4 readB8G8R8A8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1599 {
1600     return readR8G8B8A8(bufferOffset, buffer, true).bgra;
1601 }
1602 static inline void writeB8G8R8A8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1603 {
1604     color.rgba = color.bgra;
1605     return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, true);
1606 }
1607 static inline float4 readR8G8B8_UNORM(uint bufferOffset, constant uchar *buffer)
1608 {
1609     return readR8G8B8(bufferOffset, buffer, false);
1610 }
1611 static inline void writeR8G8B8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1612 {
1613     return writeR8G8B8(gIndices, options, bufferOffset, color, buffer, false);
1614 }
1615 static inline float4 readR8G8B8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1616 {
1617     return readR8G8B8(bufferOffset, buffer, true);
1618 }
1619 static inline void writeR8G8B8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1620 {
1621     return writeR8G8B8(gIndices, options, bufferOffset, color, buffer, true);
1622 }
1623 static inline float4 readL8_UNORM(uint bufferOffset, constant uchar *buffer)
1624 {
1625     float4 color;
1626     color.rgb = float3(normalizedToFloat<uchar>(buffer[bufferOffset]));
1627     color.a = 1.0;
1628     return color;
1629 }
1630 static inline void writeL8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1631 {
1632     buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1633 }
1634 static inline void writeA8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1635 {
1636     buffer[bufferOffset] = floatToNormalized<uchar>(color.a);
1637 }
1638 static inline float4 readL8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1639 {
1640     float4 color;
1641     color.rgb = float3(normalizedToFloat<uchar>(buffer[bufferOffset]));
1642     color.a = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1643     return color;
1644 }
1645 static inline void writeL8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1646 {
1647     buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1648     buffer[bufferOffset + 1] = floatToNormalized<uchar>(color.a);
1649 }
1650 static inline float4 readR8_UNORM(uint bufferOffset, constant uchar *buffer)
1651 {
1652     float4 color;
1653     color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1654     color.g = color.b = 0.0;
1655     color.a = 1.0;
1656     return color;
1657 }
1658 static inline void writeR8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1659 {
1660     buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1661 }
1662 static inline float4 readR8_SNORM(uint bufferOffset, constant uchar *buffer)
1663 {
1664     float4 color;
1665     color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1666     color.g = color.b = 0.0;
1667     color.a = 1.0;
1668     return color;
1669 }
1670 static inline void writeR8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1671 {
1672     buffer[bufferOffset] = as_type<uchar>(floatToNormalized<char>(color.r));
1673 }
1674 static inline int4 readR8_SINT(uint bufferOffset, constant uchar *buffer)
1675 {
1676     int4 color;
1677     color.r = as_type<char>(buffer[bufferOffset]);
1678     color.g = color.b = 0;
1679     color.a = 1;
1680     return color;
1681 }
1682 static inline void writeR8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1683 {
1684     buffer[bufferOffset] = static_cast<uchar>(color.r);
1685 }
1686 static inline uint4 readR8_UINT(uint bufferOffset, constant uchar *buffer)
1687 {
1688     uint4 color;
1689     color.r = as_type<uchar>(buffer[bufferOffset]);
1690     color.g = color.b = 0;
1691     color.a = 1;
1692     return color;
1693 }
1694 static inline void writeR8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1695 {
1696     buffer[bufferOffset] = static_cast<uchar>(color.r);
1697 }
1698 static inline float4 readR8G8_UNORM(uint bufferOffset, constant uchar *buffer)
1699 {
1700     float4 color;
1701     color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1702     color.g = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1703     color.b = 0.0;
1704     color.a = 1.0;
1705     return color;
1706 }
1707 static inline void writeR8G8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1708 {
1709     buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1710     buffer[bufferOffset + 1] = floatToNormalized<uchar>(color.g);
1711 }
1712 static inline float4 readR8G8_SNORM(uint bufferOffset, constant uchar *buffer)
1713 {
1714     float4 color;
1715     color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1716     color.g = normalizedToFloat<7, char>(buffer[bufferOffset + 1]);
1717     color.b = 0.0;
1718     color.a = 1.0;
1719     return color;
1720 }
1721 static inline void writeR8G8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1722 {
1723     buffer[bufferOffset] = as_type<uchar>(floatToNormalized<char>(color.r));
1724     buffer[bufferOffset + 1] = as_type<uchar>(floatToNormalized<char>(color.g));
1725 }
1726 static inline int4 readR8G8_SINT(uint bufferOffset, constant uchar *buffer)
1727 {
1728     int4 color;
1729     color.r = as_type<char>(buffer[bufferOffset]);
1730     color.g = as_type<char>(buffer[bufferOffset + 1]);
1731     color.b = 0;
1732     color.a = 1;
1733     return color;
1734 }
1735 static inline void writeR8G8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1736 {
1737     buffer[bufferOffset] = static_cast<uchar>(color.r);
1738     buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1739 }
1740 static inline uint4 readR8G8_UINT(uint bufferOffset, constant uchar *buffer)
1741 {
1742     uint4 color;
1743     color.r = as_type<uchar>(buffer[bufferOffset]);
1744     color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1745     color.b = 0;
1746     color.a = 1;
1747     return color;
1748 }
1749 static inline void writeR8G8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1750 {
1751     buffer[bufferOffset] = static_cast<uchar>(color.r);
1752     buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1753 }
1754 static inline int4 readR8G8B8_SINT(uint bufferOffset, constant uchar *buffer)
1755 {
1756     int4 color;
1757     color.r = as_type<char>(buffer[bufferOffset]);
1758     color.g = as_type<char>(buffer[bufferOffset + 1]);
1759     color.b = as_type<char>(buffer[bufferOffset + 2]);
1760     color.a = 1;
1761     return color;
1762 }
1763 static inline uint4 readR8G8B8_UINT(uint bufferOffset, constant uchar *buffer)
1764 {
1765     uint4 color;
1766     color.r = as_type<uchar>(buffer[bufferOffset]);
1767     color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1768     color.b = as_type<uchar>(buffer[bufferOffset + 2]);
1769     color.a = 1;
1770     return color;
1771 }
1772 static inline int4 readR8G8B8A8_SINT(uint bufferOffset, constant uchar *buffer)
1773 {
1774     int4 color;
1775     color.r = as_type<char>(buffer[bufferOffset]);
1776     color.g = as_type<char>(buffer[bufferOffset + 1]);
1777     color.b = as_type<char>(buffer[bufferOffset + 2]);
1778     color.a = as_type<char>(buffer[bufferOffset + 3]);
1779     return color;
1780 }
1781 static inline void writeR8G8B8A8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1782 {
1783     buffer[bufferOffset] = static_cast<uchar>(color.r);
1784     buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1785     buffer[bufferOffset + 2] = static_cast<uchar>(color.b);
1786     buffer[bufferOffset + 3] = static_cast<uchar>(color.a);
1787 }
1788 static inline uint4 readR8G8B8A8_UINT(uint bufferOffset, constant uchar *buffer)
1789 {
1790     uint4 color;
1791     color.r = as_type<uchar>(buffer[bufferOffset]);
1792     color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1793     color.b = as_type<uchar>(buffer[bufferOffset + 2]);
1794     color.a = as_type<uchar>(buffer[bufferOffset + 3]);
1795     return color;
1796 }
1797 static inline void writeR8G8B8A8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1798 {
1799     buffer[bufferOffset] = static_cast<uchar>(color.r);
1800     buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1801     buffer[bufferOffset + 2] = static_cast<uchar>(color.b);
1802     buffer[bufferOffset + 3] = static_cast<uchar>(color.a);
1803 }
1804 static inline float4 readR16_FLOAT(uint bufferOffset, constant uchar *buffer)
1805 {
1806     float4 color;
1807     color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1808     color.g = color.b = 0.0;
1809     color.a = 1.0;
1810     return color;
1811 }
1812 static inline void writeR16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1813 {
1814     shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1815 }
1816 template <typename ShortType>
1817 static inline float4 readR16_NORM(uint bufferOffset, constant uchar *buffer)
1818 {
1819     float4 color;
1820     color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1821     color.g = color.b = 0.0;
1822     color.a = 1.0;
1823     return color;
1824 }
1825 template<typename ShortType>
1826 static inline void writeR16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1827 {
1828     shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
1829 }
1830 static inline int4 readR16_SINT(uint bufferOffset, constant uchar *buffer)
1831 {
1832     int4 color;
1833     color.r = bytesToShort<short>(buffer, bufferOffset);
1834     color.g = color.b = 0;
1835     color.a = 1;
1836     return color;
1837 }
1838 static inline void writeR16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1839 {
1840     shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
1841 }
1842 static inline uint4 readR16_UINT(uint bufferOffset, constant uchar *buffer)
1843 {
1844     uint4 color;
1845     color.r = bytesToShort<ushort>(buffer, bufferOffset);
1846     color.g = color.b = 0;
1847     color.a = 1;
1848     return color;
1849 }
1850 static inline void writeR16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1851 {
1852     shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
1853 }
1854 static inline float4 readA16_FLOAT(uint bufferOffset, constant uchar *buffer)
1855 {
1856     float4 color;
1857     color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1858     color.rgb = 0.0;
1859     return color;
1860 }
1861 static inline void writeA16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1862 {
1863     shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset, buffer);
1864 }
1865 static inline float4 readL16_FLOAT(uint bufferOffset, constant uchar *buffer)
1866 {
1867     float4 color;
1868     color.rgb = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1869     color.a = 1.0;
1870     return color;
1871 }
1872 static inline void writeL16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1873 {
1874     shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1875 }
1876 static inline float4 readL16A16_FLOAT(uint bufferOffset, constant uchar *buffer)
1877 {
1878     float4 color;
1879     color.rgb = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1880     color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1881     return color;
1882 }
1883 static inline void writeL16A16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1884 {
1885     shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1886     shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset + 2, buffer);
1887 }
1888 static inline float4 readR16G16_FLOAT(uint bufferOffset, constant uchar *buffer)
1889 {
1890     float4 color;
1891     color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1892     color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1893     color.b = 0.0;
1894     color.a = 1.0;
1895     return color;
1896 }
1897 static inline void writeR16G16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1898 {
1899     shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1900     shortToBytes(as_type<ushort>(static_cast<half>(color.g)), bufferOffset + 2, buffer);
1901 }
1902 template <typename ShortType>
1903 static inline float4 readR16G16_NORM(uint bufferOffset, constant uchar *buffer)
1904 {
1905     float4 color;
1906     color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1907     color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
1908     color.b = 0.0;
1909     color.a = 1.0;
1910     return color;
1911 }
1912 template<typename ShortType>
1913 static inline void writeR16G16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1914 {
1915     shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
1916     shortToBytes(floatToNormalized<ShortType>(color.g), bufferOffset + 2, buffer);
1917 }
1918 static inline int4 readR16G16_SINT(uint bufferOffset, constant uchar *buffer)
1919 {
1920     int4 color;
1921     color.r = bytesToShort<short>(buffer, bufferOffset);
1922     color.g = bytesToShort<short>(buffer, bufferOffset + 2);
1923     color.b = 0;
1924     color.a = 1;
1925     return color;
1926 }
1927 static inline void writeR16G16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1928 {
1929     shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
1930     shortToBytes(static_cast<short>(color.g), bufferOffset + 2, buffer);
1931 }
1932 static inline uint4 readR16G16_UINT(uint bufferOffset, constant uchar *buffer)
1933 {
1934     uint4 color;
1935     color.r = bytesToShort<ushort>(buffer, bufferOffset);
1936     color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
1937     color.b = 0;
1938     color.a = 1;
1939     return color;
1940 }
1941 static inline void writeR16G16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1942 {
1943     shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
1944     shortToBytes(static_cast<ushort>(color.g), bufferOffset + 2, buffer);
1945 }
1946 static inline float4 readR16G16B16_FLOAT(uint bufferOffset, constant uchar *buffer)
1947 {
1948     float4 color;
1949     color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1950     color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1951     color.b = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 4));
1952     color.a = 1.0;
1953     return color;
1954 }
1955 template <typename ShortType>
1956 static inline float4 readR16G16B16_NORM(uint bufferOffset, constant uchar *buffer)
1957 {
1958     float4 color;
1959     color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1960     color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
1961     color.b = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 4));
1962     color.a = 1.0;
1963     return color;
1964 }
1965 static inline int4 readR16G16B16_SINT(uint bufferOffset, constant uchar *buffer)
1966 {
1967     int4 color;
1968     color.r = bytesToShort<short>(buffer, bufferOffset);
1969     color.g = bytesToShort<short>(buffer, bufferOffset + 2);
1970     color.b = bytesToShort<short>(buffer, bufferOffset + 4);
1971     color.a = 1;
1972     return color;
1973 }
1974 static inline uint4 readR16G16B16_UINT(uint bufferOffset, constant uchar *buffer)
1975 {
1976     uint4 color;
1977     color.r = bytesToShort<ushort>(buffer, bufferOffset);
1978     color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
1979     color.b = bytesToShort<ushort>(buffer, bufferOffset + 4);
1980     color.a = 1;
1981     return color;
1982 }
1983 static inline float4 readR16G16B16A16_FLOAT(uint bufferOffset, constant uchar *buffer)
1984 {
1985     float4 color;
1986     color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1987     color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1988     color.b = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 4));
1989     color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 6));
1990     return color;
1991 }
1992 static inline void writeR16G16B16A16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1993 {
1994     shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1995     shortToBytes(as_type<ushort>(static_cast<half>(color.g)), bufferOffset + 2, buffer);
1996     shortToBytes(as_type<ushort>(static_cast<half>(color.b)), bufferOffset + 4, buffer);
1997     shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset + 6, buffer);
1998 }
1999 template <typename ShortType>
2000 static inline float4 readR16G16B16A16_NORM(uint bufferOffset, constant uchar *buffer)
2001 {
2002     float4 color;
2003     color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
2004     color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
2005     color.b = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 4));
2006     color.a = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 6));
2007     return color;
2008 }
2009 template<typename ShortType>
2010 static inline void writeR16G16B16A16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2011 {
2012     shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
2013     shortToBytes(floatToNormalized<ShortType>(color.g), bufferOffset + 2, buffer);
2014     shortToBytes(floatToNormalized<ShortType>(color.b), bufferOffset + 4, buffer);
2015     shortToBytes(floatToNormalized<ShortType>(color.a), bufferOffset + 6, buffer);
2016 }
2017 static inline int4 readR16G16B16A16_SINT(uint bufferOffset, constant uchar *buffer)
2018 {
2019     int4 color;
2020     color.r = bytesToShort<short>(buffer, bufferOffset);
2021     color.g = bytesToShort<short>(buffer, bufferOffset + 2);
2022     color.b = bytesToShort<short>(buffer, bufferOffset + 4);
2023     color.a = bytesToShort<short>(buffer, bufferOffset + 6);
2024     return color;
2025 }
2026 static inline void writeR16G16B16A16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2027 {
2028     shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
2029     shortToBytes(static_cast<short>(color.g), bufferOffset + 2, buffer);
2030     shortToBytes(static_cast<short>(color.b), bufferOffset + 4, buffer);
2031     shortToBytes(static_cast<short>(color.a), bufferOffset + 6, buffer);
2032 }
2033 static inline uint4 readR16G16B16A16_UINT(uint bufferOffset, constant uchar *buffer)
2034 {
2035     uint4 color;
2036     color.r = bytesToShort<ushort>(buffer, bufferOffset);
2037     color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
2038     color.b = bytesToShort<ushort>(buffer, bufferOffset + 4);
2039     color.a = bytesToShort<ushort>(buffer, bufferOffset + 6);
2040     return color;
2041 }
2042 static inline void writeR16G16B16A16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2043 {
2044     shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
2045     shortToBytes(static_cast<ushort>(color.g), bufferOffset + 2, buffer);
2046     shortToBytes(static_cast<ushort>(color.b), bufferOffset + 4, buffer);
2047     shortToBytes(static_cast<ushort>(color.a), bufferOffset + 6, buffer);
2048 }
2049 static inline float4 readR32_FLOAT(uint bufferOffset, constant uchar *buffer)
2050 {
2051     float4 color;
2052     color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2053     color.g = color.b = 0.0;
2054     color.a = 1.0;
2055     return color;
2056 }
2057 static inline void writeR32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2058 {
2059     intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2060 }
2061 template <typename IntType>
2062 static inline float4 readR32_NORM(uint bufferOffset, constant uchar *buffer)
2063 {
2064     float4 color;
2065     color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2066     color.g = color.b = 0.0;
2067     color.a = 1.0;
2068     return color;
2069 }
2070 static inline float4 readA32_FLOAT(uint bufferOffset, constant uchar *buffer)
2071 {
2072     float4 color;
2073     color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2074     color.rgb = 0.0;
2075     return color;
2076 }
2077 static inline void writeA32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2078 {
2079     intToBytes(as_type<uint>(color.a), bufferOffset, buffer);
2080 }
2081 static inline float4 readL32_FLOAT(uint bufferOffset, constant uchar *buffer)
2082 {
2083     float4 color;
2084     color.rgb = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2085     color.a = 1.0;
2086     return color;
2087 }
2088 static inline void writeL32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2089 {
2090     intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2091 }
2092 static inline int4 readR32_SINT(uint bufferOffset, constant uchar *buffer)
2093 {
2094     int4 color;
2095     color.r = bytesToInt<int>(buffer, bufferOffset);
2096     color.g = color.b = 0;
2097     color.a = 1;
2098     return color;
2099 }
2100 static inline void writeR32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2101 {
2102     intToBytes(color.r, bufferOffset, buffer);
2103 }
2104 static inline float4 readR32_FIXED(uint bufferOffset, constant uchar *buffer)
2105 {
2106     float4 color;
2107     constexpr float kDivisor = 1.0f / (1 << 16);
2108     color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2109     color.g = color.b = 0.0;
2110     color.a = 1.0;
2111     return color;
2112 }
2113 static inline uint4 readR32_UINT(uint bufferOffset, constant uchar *buffer)
2114 {
2115     uint4 color;
2116     color.r = bytesToInt<uint>(buffer, bufferOffset);
2117     color.g = color.b = 0;
2118     color.a = 1;
2119     return color;
2120 }
2121 static inline void writeR32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2122 {
2123     intToBytes(color.r, bufferOffset, buffer);
2124 }
2125 static inline float4 readL32A32_FLOAT(uint bufferOffset, constant uchar *buffer)
2126 {
2127     float4 color;
2128     color.rgb = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2129     color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2130     return color;
2131 }
2132 static inline void writeL32A32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2133 {
2134     intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2135     intToBytes(as_type<uint>(color.a), bufferOffset + 4, buffer);
2136 }
2137 static inline float4 readR32G32_FLOAT(uint bufferOffset, constant uchar *buffer)
2138 {
2139     float4 color;
2140     color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2141     color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2142     color.b = 0.0;
2143     color.a = 1.0;
2144     return color;
2145 }
2146 static inline void writeR32G32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2147 {
2148     intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2149     intToBytes(as_type<uint>(color.g), bufferOffset + 4, buffer);
2150 }
2151 template <typename IntType>
2152 static inline float4 readR32G32_NORM(uint bufferOffset, constant uchar *buffer)
2153 {
2154     float4 color;
2155     color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2156     color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2157     color.b = 0.0;
2158     color.a = 1.0;
2159     return color;
2160 }
2161 static inline int4 readR32G32_SINT(uint bufferOffset, constant uchar *buffer)
2162 {
2163     int4 color;
2164     color.r = bytesToInt<int>(buffer, bufferOffset);
2165     color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2166     color.b = 0;
2167     color.a = 1;
2168     return color;
2169 }
2170 static inline void writeR32G32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2171 {
2172     intToBytes(color.r, bufferOffset, buffer);
2173     intToBytes(color.g, bufferOffset + 4, buffer);
2174 }
2175 static inline float4 readR32G32_FIXED(uint bufferOffset, constant uchar *buffer)
2176 {
2177     float4 color;
2178     constexpr float kDivisor = 1.0f / (1 << 16);
2179     color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2180     color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2181     color.b = 0.0;
2182     color.a = 1.0;
2183     return color;
2184 }
2185 static inline uint4 readR32G32_UINT(uint bufferOffset, constant uchar *buffer)
2186 {
2187     uint4 color;
2188     color.r = bytesToInt<uint>(buffer, bufferOffset);
2189     color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2190     color.b = 0;
2191     color.a = 1;
2192     return color;
2193 }
2194 static inline void writeR32G32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2195 {
2196     intToBytes(color.r, bufferOffset, buffer);
2197     intToBytes(color.g, bufferOffset + 4, buffer);
2198 }
2199 static inline float4 readR32G32B32_FLOAT(uint bufferOffset, constant uchar *buffer)
2200 {
2201     float4 color;
2202     color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2203     color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2204     color.b = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 8));
2205     color.a = 1.0;
2206     return color;
2207 }
2208 template <typename IntType>
2209 static inline float4 readR32G32B32_NORM(uint bufferOffset, constant uchar *buffer)
2210 {
2211     float4 color;
2212     color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2213     color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2214     color.b = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 8));
2215     color.a = 1.0;
2216     return color;
2217 }
2218 static inline int4 readR32G32B32_SINT(uint bufferOffset, constant uchar *buffer)
2219 {
2220     int4 color;
2221     color.r = bytesToInt<int>(buffer, bufferOffset);
2222     color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2223     color.b = bytesToInt<int>(buffer, bufferOffset + 8);
2224     color.a = 1;
2225     return color;
2226 }
2227 static inline float4 readR32G32B32_FIXED(uint bufferOffset, constant uchar *buffer)
2228 {
2229     float4 color;
2230     constexpr float kDivisor = 1.0f / (1 << 16);
2231     color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2232     color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2233     color.b = bytesToInt<int>(buffer, bufferOffset + 8) * kDivisor;
2234     color.a = 1.0;
2235     return color;
2236 }
2237 static inline uint4 readR32G32B32_UINT(uint bufferOffset, constant uchar *buffer)
2238 {
2239     uint4 color;
2240     color.r = bytesToInt<uint>(buffer, bufferOffset);
2241     color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2242     color.b = bytesToInt<uint>(buffer, bufferOffset + 8);
2243     color.a = 1;
2244     return color;
2245 }
2246 static inline float4 readR32G32B32A32_FLOAT(uint bufferOffset, constant uchar *buffer)
2247 {
2248     float4 color;
2249     color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2250     color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2251     color.b = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 8));
2252     color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 12));
2253     return color;
2254 }
2255 static inline void writeR32G32B32A32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2256 {
2257     intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2258     intToBytes(as_type<uint>(color.g), bufferOffset + 4, buffer);
2259     intToBytes(as_type<uint>(color.b), bufferOffset + 8, buffer);
2260     intToBytes(as_type<uint>(color.a), bufferOffset + 12, buffer);
2261 }
2262 template <typename IntType>
2263 static inline float4 readR32G32B32A32_NORM(uint bufferOffset, constant uchar *buffer)
2264 {
2265     float4 color;
2266     color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2267     color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2268     color.b = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 8));
2269     color.a = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 12));
2270     return color;
2271 }
2272 static inline int4 readR32G32B32A32_SINT(uint bufferOffset, constant uchar *buffer)
2273 {
2274     int4 color;
2275     color.r = bytesToInt<int>(buffer, bufferOffset);
2276     color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2277     color.b = bytesToInt<int>(buffer, bufferOffset + 8);
2278     color.a = bytesToInt<int>(buffer, bufferOffset + 12);
2279     return color;
2280 }
2281 static inline void writeR32G32B32A32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2282 {
2283     intToBytes(color.r, bufferOffset, buffer);
2284     intToBytes(color.g, bufferOffset + 4, buffer);
2285     intToBytes(color.b, bufferOffset + 8, buffer);
2286     intToBytes(color.a, bufferOffset + 12, buffer);
2287 }
2288 static inline float4 readR32G32B32A32_FIXED(uint bufferOffset, constant uchar *buffer)
2289 {
2290     float4 color;
2291     constexpr float kDivisor = 1.0f / (1 << 16);
2292     color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2293     color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2294     color.b = bytesToInt<int>(buffer, bufferOffset + 8) * kDivisor;
2295     color.a = bytesToInt<int>(buffer, bufferOffset + 12) * kDivisor;
2296     return color;
2297 }
2298 static inline uint4 readR32G32B32A32_UINT(uint bufferOffset, constant uchar *buffer)
2299 {
2300     uint4 color;
2301     color.r = bytesToInt<uint>(buffer, bufferOffset);
2302     color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2303     color.b = bytesToInt<uint>(buffer, bufferOffset + 8);
2304     color.a = bytesToInt<uint>(buffer, bufferOffset + 12);
2305     return color;
2306 }
2307 static inline void writeR32G32B32A32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2308 {
2309     intToBytes(color.r, bufferOffset, buffer);
2310     intToBytes(color.g, bufferOffset + 4, buffer);
2311     intToBytes(color.b, bufferOffset + 8, buffer);
2312     intToBytes(color.a, bufferOffset + 12, buffer);
2313 }
2314 static inline int4 readR8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8_SINT(bufferOffset, buffer); } static inline uint4 readR8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8_UINT(bufferOffset, buffer); } static inline int4 readR8G8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8_UINT(bufferOffset, buffer); } static inline int4 readR8G8B8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8B8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8_UINT(bufferOffset, buffer); } static inline int4 readR8G8B8A8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8A8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8B8A8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8A8_UINT(bufferOffset, buffer); }
2315 static inline int4 readR16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16_SINT(bufferOffset, buffer); } static inline uint4 readR16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16_UINT(bufferOffset, buffer); } static inline int4 readR16G16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16_UINT(bufferOffset, buffer); } static inline int4 readR16G16B16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16B16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16_UINT(bufferOffset, buffer); } static inline int4 readR16G16B16A16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16A16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16B16A16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16A16_UINT(bufferOffset, buffer); }
2316 static inline int4 readR32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32_SINT(bufferOffset, buffer); } static inline uint4 readR32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32_UINT(bufferOffset, buffer); } static inline int4 readR32G32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32_UINT(bufferOffset, buffer); } static inline int4 readR32G32B32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32B32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32_UINT(bufferOffset, buffer); } static inline int4 readR32G32B32A32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32A32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32B32A32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32A32_UINT(bufferOffset, buffer); }
2317 static inline int4 readR10G10B10A2_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR10G10B10A2_SINT(bufferOffset, buffer); } static inline uint4 readR10G10B10A2_USCALED(uint bufferOffset, constant uchar *buffer) { return readR10G10B10A2_UINT(bufferOffset, buffer); }
2318 kernel void readFromBufferToFloatTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<float, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<float, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<float, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<float, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2319 {
2320     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2321     uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2322     switch (kCopyFormatType)
2323     {
2324         case FormatID::R5G6B5_UNORM: { auto color = readR5G6B5_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UNORM: { auto color = readR8G8B8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UNORM_SRGB: { auto color = readR8G8B8A8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_SNORM: { auto color = readR8G8B8A8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::B8G8R8A8_UNORM: { auto color = readB8G8R8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::B8G8R8A8_UNORM_SRGB: { auto color = readB8G8R8A8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UNORM: { auto color = readR8G8B8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UNORM_SRGB: { auto color = readR8G8B8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_SNORM: { auto color = readR8G8B8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L8_UNORM: { auto color = readL8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L8A8_UNORM: { auto color = readL8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R5G5B5A1_UNORM: { auto color = readR5G5B5A1_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R4G4B4A4_UNORM: { auto color = readR4G4B4A4_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8_UNORM: { auto color = readR8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8_SNORM: { auto color = readR8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_UNORM: { auto color = readR8G8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_SNORM: { auto color = readR8G8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_FLOAT: { auto color = readR16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_SNORM: { auto color = readR16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_UNORM: { auto color = readR16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::A16_FLOAT: { auto color = readA16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L16_FLOAT: { auto color = readL16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L16A16_FLOAT: { auto color = readL16A16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_FLOAT: { auto color = readR16G16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_SNORM: { auto color = readR16G16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_UNORM: { auto color = readR16G16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_FLOAT: { auto color = readR16G16B16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_SNORM: { auto color = readR16G16B16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_UNORM: { auto color = readR16G16B16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_FLOAT: { auto color = readR16G16B16A16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_SNORM: { auto color = readR16G16B16A16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_UNORM: { auto color = readR16G16B16A16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_FLOAT: { auto color = readR32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::A32_FLOAT: { auto color = readA32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L32_FLOAT: { auto color = readL32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L32A32_FLOAT: { auto color = readL32A32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_FLOAT: { auto color = readR32G32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_FLOAT: { auto color = readR32G32B32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_FLOAT: { auto color = readR32G32B32A32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2325     }
2326 }
2327 kernel void readFromBufferToIntTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<int, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<int, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<int, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<int, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2328 {
2329     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2330     uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2331     switch (kCopyFormatType)
2332     {
2333         case FormatID::R8_SINT: { auto color = readR8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_SINT: { auto color = readR8G8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_SINT: { auto color = readR8G8B8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_SINT: { auto color = readR8G8B8A8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_SINT: { auto color = readR16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_SINT: { auto color = readR16G16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_SINT: { auto color = readR16G16B16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_SINT: { auto color = readR16G16B16A16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_SINT: { auto color = readR32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_SINT: { auto color = readR32G32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_SINT: { auto color = readR32G32B32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_SINT: { auto color = readR32G32B32A32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2334     }
2335 }
2336 kernel void readFromBufferToUIntTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<uint, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<uint, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<uint, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<uint, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2337 {
2338     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2339     uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2340     switch (kCopyFormatType)
2341     {
2342         case FormatID::R8_UINT: { auto color = readR8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_UINT: { auto color = readR8G8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UINT: { auto color = readR8G8B8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UINT: { auto color = readR8G8B8A8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_UINT: { auto color = readR16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_UINT: { auto color = readR16G16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_UINT: { auto color = readR16G16B16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_UINT: { auto color = readR16G16B16A16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_UINT: { auto color = readR32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_UINT: { auto color = readR32G32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_UINT: { auto color = readR32G32B32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_UINT: { auto color = readR32G32B32A32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2343     }
2344 }
2345 kernel void writeFromFloatTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<float, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<float, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<float, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<float, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<float, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2346 {
2347     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2348     uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2349     switch (kCopyFormatType)
2350     {
2351         case FormatID::R5G6B5_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR5G6B5_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::B8G8R8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeB8G8R8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::B8G8R8A8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeB8G8R8A8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R5G5B5A1_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR5G5B5A1_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R4G4B4A4_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR4G4B4A4_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L16A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL16A16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L32A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL32A32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break;
2352     }
2353 }
2354 kernel void writeFromIntTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<int, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<int, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<int, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<int, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<int, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2355 {
2356     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2357     uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2358     switch (kCopyFormatType)
2359     {
2360         case FormatID::R8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_SINT(gIndices, options, bufferOffset, color, buffer); } break;
2361     }
2362 }
2363 kernel void writeFromUIntTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<uint, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<uint, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<uint, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<uint, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<uint, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2364 {
2365     if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2366     uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2367     switch (kCopyFormatType)
2368     {
2369         case FormatID::R8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_UINT(gIndices, options, bufferOffset, color, buffer); } break;
2370     }
2371 }
2372 struct CopyVertexParams
2373 {
2374     uint srcBufferStartOffset;
2375     uint srcStride;
2376     uint srcComponentBytes;
2377     uint srcComponents;
2378     uchar4 srcDefaultAlphaData;
2379     uint dstBufferStartOffset;
2380     uint dstStride;
2381     uint dstComponents;
2382     uint vertexCount;
2383 };
2384 template <typename IntType>
2385 static inline void writeFloatVertex(constant CopyVertexParams &options,
2386                                     uint idx,
2387                                     vec<IntType, 4> data,
2388                                     device uchar *dst)
2389 {
2390     uint dstOffset = idx * options.dstStride + options.dstBufferStartOffset;
2391     for (uint component = 0; component < options.dstComponents; ++component, dstOffset += 4)
2392     {
2393         floatToBytes(static_cast<float>(data[component]), dstOffset, dst);
2394     }
2395 }
2396 template <>
2397 inline void writeFloatVertex(constant CopyVertexParams &options,
2398                              uint idx,
2399                              vec<float, 4> data,
2400                              device uchar *dst)
2401 {
2402     uint dstOffset = idx * options.dstStride + options.dstBufferStartOffset;
2403     for (uint component = 0; component < options.dstComponents; ++component, dstOffset += 4)
2404     {
2405         floatToBytes(data[component], dstOffset, dst);
2406     }
2407 }
2408 static inline void convertToFloatVertexFormat(uint index,
2409                                               constant CopyVertexParams &options,
2410                                               constant uchar *srcBuffer,
2411                                               device uchar *dstBuffer)
2412 {
2413     uint bufferOffset = options.srcBufferStartOffset + options.srcStride * index;
2414     switch (kCopyFormatType)
2415     {
2416         case FormatID::R8_UNORM: { auto data = readR8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SNORM: { auto data = readR8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_UINT: { auto data = readR8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SINT: { auto data = readR8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_USCALED: { auto data = readR8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SSCALED: { auto data = readR8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_UNORM: { auto data = readR8G8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SNORM: { auto data = readR8G8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_UINT: { auto data = readR8G8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SINT: { auto data = readR8G8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_USCALED: { auto data = readR8G8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SSCALED: { auto data = readR8G8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_UNORM: { auto data = readR8G8B8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SNORM: { auto data = readR8G8B8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_UINT: { auto data = readR8G8B8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SINT: { auto data = readR8G8B8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_USCALED: { auto data = readR8G8B8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SSCALED: { auto data = readR8G8B8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_UNORM: { auto data = readR8G8B8A8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SNORM: { auto data = readR8G8B8A8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_UINT: { auto data = readR8G8B8A8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SINT: { auto data = readR8G8B8A8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_USCALED: { auto data = readR8G8B8A8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SSCALED: { auto data = readR8G8B8A8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_UNORM: { auto data = readR16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SNORM: { auto data = readR16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_UINT: { auto data = readR16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SINT: { auto data = readR16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_USCALED: { auto data = readR16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SSCALED: { auto data = readR16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_UNORM: { auto data = readR16G16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SNORM: { auto data = readR16G16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_UINT: { auto data = readR16G16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SINT: { auto data = readR16G16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_USCALED: { auto data = readR16G16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SSCALED: { auto data = readR16G16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_UNORM: { auto data = readR16G16B16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SNORM: { auto data = readR16G16B16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_UINT: { auto data = readR16G16B16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SINT: { auto data = readR16G16B16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_USCALED: { auto data = readR16G16B16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SSCALED: { auto data = readR16G16B16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_UNORM: { auto data = readR16G16B16A16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SNORM: { auto data = readR16G16B16A16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_UINT: { auto data = readR16G16B16A16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SINT: { auto data = readR16G16B16A16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_USCALED: { auto data = readR16G16B16A16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SSCALED: { auto data = readR16G16B16A16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_UNORM: { auto data = readR32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SNORM: { auto data = readR32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_UINT: { auto data = readR32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SINT: { auto data = readR32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_USCALED: { auto data = readR32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SSCALED: { auto data = readR32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_UNORM: { auto data = readR32G32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SNORM: { auto data = readR32G32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_UINT: { auto data = readR32G32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SINT: { auto data = readR32G32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_USCALED: { auto data = readR32G32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SSCALED: { auto data = readR32G32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_UNORM: { auto data = readR32G32B32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SNORM: { auto data = readR32G32B32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_UINT: { auto data = readR32G32B32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SINT: { auto data = readR32G32B32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_USCALED: { auto data = readR32G32B32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SSCALED: { auto data = readR32G32B32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_UNORM: { auto data = readR32G32B32A32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SNORM: { auto data = readR32G32B32A32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_UINT: { auto data = readR32G32B32A32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SINT: { auto data = readR32G32B32A32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_USCALED: { auto data = readR32G32B32A32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SSCALED: { auto data = readR32G32B32A32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_FLOAT: { auto data = readR16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_FLOAT: { auto data = readR16G16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_FLOAT: { auto data = readR16G16B16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_FLOAT: { auto data = readR16G16B16A16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_FLOAT: { auto data = readR32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_FLOAT: { auto data = readR32G32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_FLOAT: { auto data = readR32G32B32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_FLOAT: { auto data = readR32G32B32A32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_FIXED: { auto data = readR32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_FIXED: { auto data = readR32G32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_FIXED: { auto data = readR32G32B32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_FIXED: { auto data = readR32G32B32A32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_SINT: { auto data = readR10G10B10A2_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_UINT: { auto data = readR10G10B10A2_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_SSCALED: { auto data = readR10G10B10A2_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_USCALED: { auto data = readR10G10B10A2_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break;
2417     }
2418 }
2419 kernel void convertToFloatVertexFormatCS(uint index [[thread_position_in_grid]],
2420                                          constant CopyVertexParams &options [[buffer(0)]],
2421                                          constant uchar *srcBuffer [[buffer(1)]],
2422                                          device uchar *dstBuffer [[buffer(2)]])
2423 {
2424     if (index >= options.vertexCount) { return; };
2425     convertToFloatVertexFormat(index, options, srcBuffer, dstBuffer);
2426 }
2427 vertex void convertToFloatVertexFormatVS(uint index [[vertex_id]],
2428                                          constant CopyVertexParams &options [[buffer(0)]],
2429                                          constant uchar *srcBuffer [[buffer(1)]],
2430                                          device uchar *dstBuffer [[buffer(2)]])
2431 {
2432     convertToFloatVertexFormat(index, options, srcBuffer, dstBuffer);
2433 }
2434 static inline void expandVertexFormatComponents(uint index,
2435                                                 constant CopyVertexParams &options,
2436                                                 constant uchar *srcBuffer,
2437                                                 device uchar *dstBuffer)
2438 {
2439     uint srcOffset = options.srcBufferStartOffset + options.srcStride * index;
2440     uint dstOffset = options.dstBufferStartOffset + options.dstStride * index;
2441     uint dstComponentsBeforeAlpha = min(options.dstComponents, 3u);
2442     uint component;
2443     for (component = 0; component < options.srcComponents; ++component,
2444         srcOffset += options.srcComponentBytes, dstOffset += options.srcComponentBytes)
2445     {
2446         for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2447         {
2448             dstBuffer[dstOffset + byte] = srcBuffer[srcOffset + byte];
2449         }
2450     }
2451     for (; component < dstComponentsBeforeAlpha;
2452          ++component, dstOffset += options.srcComponentBytes)
2453     {
2454         for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2455         {
2456             dstBuffer[dstOffset + byte] = 0;
2457         }
2458     }
2459     if (component < options.dstComponents)
2460     {
2461         for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2462         {
2463             dstBuffer[dstOffset + byte] = options.srcDefaultAlphaData[byte];
2464         }
2465     }
2466 }
2467 kernel void expandVertexFormatComponentsCS(uint index [[thread_position_in_grid]],
2468                                            constant CopyVertexParams &options [[buffer(0)]],
2469                                            constant uchar *srcBuffer [[buffer(1)]],
2470                                            device uchar *dstBuffer [[buffer(2)]])
2471 {
2472     if (index >= options.vertexCount) { return; };
2473     expandVertexFormatComponents(index, options, srcBuffer, dstBuffer);
2474 }
2475 vertex void expandVertexFormatComponentsVS(uint index [[vertex_id]],
2476                                            constant CopyVertexParams &options [[buffer(0)]],
2477                                            constant uchar *srcBuffer [[buffer(1)]],
2478                                            device uchar *dstBuffer [[buffer(2)]])
2479 {
2480     expandVertexFormatComponents(index, options, srcBuffer, dstBuffer);
2481 }
2482 kernel void linearizeBlocks(ushort2 position [[thread_position_in_grid]],
2483                             constant uint2 *dimensions [[buffer(0)]],
2484                             constant uint2 *srcBuffer [[buffer(1)]],
2485                             device uint2 *dstBuffer [[buffer(2)]])
2486 {
2487     if (any(uint2(position) >= *dimensions))
2488     {
2489         return;
2490     }
2491     uint2 t = uint2(position);
2492     t = (t | (t << 8)) & 0x00FF00FF;
2493     t = (t | (t << 4)) & 0x0F0F0F0F;
2494     t = (t | (t << 2)) & 0x33333333;
2495     t = (t | (t << 1)) & 0x55555555;
2496     dstBuffer[position.y * (*dimensions).x + position.x] = srcBuffer[(t.x << 1) | t.y];
2497 }
2498 kernel void saturateDepth(uint2 position [[thread_position_in_grid]],
2499                           constant uint3 *dimensions [[buffer(0)]],
2500                           device float *srcBuffer [[buffer(1)]],
2501                           device float *dstBuffer [[buffer(2)]])
2502 {
2503     if (any(position >= (*dimensions).xy))
2504     {
2505         return;
2506     }
2507     const uint srcOffset = position.y * (*dimensions).z + position.x;
2508     const uint dstOffset = position.y * (*dimensions).x + position.x;
2509     dstBuffer[dstOffset] = saturate(srcBuffer[srcOffset]);
2510 }
2511 constant bool kCombineWithExistingResult [[function_constant(1000)]];
2512 struct CombineVisibilityResultOptions
2513 {
2514     uint startOffset;
2515     uint numOffsets;
2516 };
2517 kernel void combineVisibilityResult(uint idx [[thread_position_in_grid]],
2518                                     constant CombineVisibilityResultOptions &options [[buffer(0)]],
2519                                     constant ushort4 *renderpassVisibilityResult [[buffer(1)]],
2520                                     device ushort4 *finalResults [[buffer(2)]])
2521 {
2522     if (idx > 0)
2523     {
2524         return;
2525     }
2526     ushort4 finalResult16x4;
2527     if (kCombineWithExistingResult)
2528     {
2529         finalResult16x4 = finalResults[0];
2530     }
2531     else
2532     {
2533         finalResult16x4 = ushort4(0, 0, 0, 0);
2534     }
2535     for (uint i = 0; i < options.numOffsets; ++i)
2536     {
2537         uint offset = options.startOffset + i;
2538         ushort4 renderpassResult = renderpassVisibilityResult[offset];
2539         finalResult16x4 = finalResult16x4 | renderpassResult;
2540     }
2541     finalResults[0] = finalResult16x4;
2542 }
2543 using namespace metal;
2544 constant uint fixIndexBufferKey [[ function_constant(2000) ]];
2545 constant bool indexBufferIsUint16 = (((fixIndexBufferKey >> 0U) & 0x03U) == 2U);
2546 constant bool indexBufferIsUint32 = (((fixIndexBufferKey >> 0U) & 0x03U) == 3U);
2547 constant bool outIndexBufferIsUint16 = (((fixIndexBufferKey >> 2U) & 0x03U) == 2U);
2548 constant bool outIndexBufferIsUint32 = (((fixIndexBufferKey >> 2U) & 0x03U) == 3U);
2549 constant bool doPrimRestart = (fixIndexBufferKey & 0x00100U);
2550 constant uint fixIndexBufferMode = (fixIndexBufferKey >> 4U) & 0x0FU;
2551 static inline uint readIdx(
2552                            const device ushort *indexBufferUint16,
2553                            const device uint *indexBufferUint32,
2554                            const uint restartIndex,
2555                            const uint indexCount,
2556                            uint idx,
2557                            thread bool &foundRestart,
2558                            thread uint &indexThatRestartedFirst
2559                            )
2560 {
2561     uint inIndex = idx;
2562     if(inIndex < indexCount)
2563     {
2564         if(indexBufferIsUint16)
2565         {
2566             inIndex = indexBufferUint16[inIndex];
2567         }
2568         else if(indexBufferIsUint32)
2569         {
2570             inIndex = indexBufferUint32[inIndex];
2571         }
2572     }
2573     else
2574     {
2575         foundRestart = true;
2576         indexThatRestartedFirst = idx;
2577     }
2578     if(doPrimRestart && !foundRestart && inIndex == restartIndex)
2579     {
2580         foundRestart = true;
2581         indexThatRestartedFirst = idx;
2582     }
2583     return inIndex;
2584 }
2585 static inline void outputPrimitive(
2586                                    const device ushort *indexBufferUint16,
2587                                    const device uint *indexBufferUint32,
2588                                    device ushort *outIndexBufferUint16,
2589                                    device uint *outIndexBufferUint32,
2590                                    const uint restartIndex,
2591                                    const uint indexCount,
2592                                    thread uint &baseIndex,
2593                                    uint onIndex,
2594                                    thread uint &onOutIndex
2595                                    )
2596 {
2597     if(baseIndex > onIndex) return;
2598     bool foundRestart = false;
2599     uint indexThatRestartedFirst = 0;
2600     switch(fixIndexBufferMode)
2601     {
2602         case 0x00U:
2603         {
2604             auto tmpIndex = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex, foundRestart, indexThatRestartedFirst);
2605             if(foundRestart)
2606             {
2607                 baseIndex = indexThatRestartedFirst + 1;
2608                 return;
2609             }
2610             ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex; } onOutIndex++; });
2611         }
2612         break;
2613         case 0x01U:
2614         {
2615             auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2616             auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2617             if(foundRestart)
2618             {
2619                 baseIndex = indexThatRestartedFirst + 1;
2620                 return;
2621             }
2622             if((onIndex - baseIndex) & 1) return;
2623             if(fixIndexBufferKey & 0x00200U)
2624             {
2625                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2626                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2627             }
2628             else
2629             {
2630                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2631                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2632             }
2633         }
2634         break;
2635         case 0x03U:
2636         {
2637             auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2638             auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2639             if(foundRestart)
2640             {
2641                 baseIndex = indexThatRestartedFirst + 1;
2642                 return;
2643             }
2644             if(fixIndexBufferKey & 0x00200U)
2645             {
2646                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2647                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2648             }
2649             else
2650             {
2651                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2652                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2653             }
2654         }
2655         break;
2656         case 0x04U:
2657         {
2658             auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2659             auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2660             auto tmpIndex2 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 2, foundRestart, indexThatRestartedFirst);
2661             if(foundRestart)
2662             {
2663                 baseIndex = indexThatRestartedFirst + 1;
2664                 return;
2665             }
2666             if(((onIndex - baseIndex) % 3) != 0) return;
2667             if(fixIndexBufferKey & 0x00200U)
2668             {
2669                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2670                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2671                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2672             }
2673             else
2674             {
2675                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2676                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2677                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2678             }
2679         }
2680         break;
2681         case 0x05U:
2682         {
2683             uint isOdd = ((onIndex - baseIndex) & 1);
2684             auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0 + isOdd, foundRestart, indexThatRestartedFirst);
2685             auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1 - isOdd, foundRestart, indexThatRestartedFirst);
2686             auto tmpIndex2 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 2, foundRestart, indexThatRestartedFirst);
2687             if(foundRestart)
2688             {
2689                 baseIndex = indexThatRestartedFirst + 1;
2690                 return;
2691             }
2692             if(fixIndexBufferKey & 0x00200U)
2693             {
2694                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2695                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2696                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2697             }
2698             else
2699             {
2700                 if(isOdd)
2701                 {
2702                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2703                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2704                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2705                 }
2706                 else
2707                 {
2708                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2709                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2710                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2711                 }
2712             }
2713             assert(onOutIndex <= (onIndex + 1) * 3);
2714             assert(onOutIndex <= (indexCount - 2) * 3);
2715         }
2716         break;
2717     }
2718 }
2719 kernel void fixIndexBuffer(
2720                            const device ushort *indexBufferUint16 [[ buffer(0), function_constant(indexBufferIsUint16) ]],
2721                            const device uint *indexBufferUint32 [[ buffer(0), function_constant(indexBufferIsUint32) ]],
2722                            device ushort *outIndexBufferUint16 [[ buffer(1), function_constant(outIndexBufferIsUint16) ]],
2723                            device uint *outIndexBufferUint32 [[ buffer(1), function_constant(outIndexBufferIsUint32) ]],
2724                            constant uint &indexCount [[ buffer(2) ]],
2725                            constant uint &primCount [[ buffer(3) ]],
2726                            uint prim [[thread_position_in_grid]])
2727 {
2728     constexpr uint restartIndex = 0xFFFFFFFF;
2729     uint baseIndex = 0;
2730     uint onIndex = onIndex;
2731     uint onOutIndex = onOutIndex;
2732     if(prim < primCount)
2733     {
2734         switch(fixIndexBufferMode)
2735         {
2736             case 0x00U:
2737                 onIndex = prim;
2738                 onOutIndex = prim;
2739                 break;
2740             case 0x01U:
2741                 onIndex = prim * 2;
2742                 onOutIndex = prim * 2;
2743                 break;
2744             case 0x03U:
2745                 onIndex = prim;
2746                 onOutIndex = prim * 2;
2747                 break;
2748             case 0x04U:
2749                 onIndex = prim * 3;
2750                 onOutIndex = prim * 3;
2751                 break;
2752             case 0x05U:
2753                 onIndex = prim;
2754                 onOutIndex = prim * 3;
2755                 break;
2756         }
2757         outputPrimitive(indexBufferUint16, indexBufferUint32, outIndexBufferUint16, outIndexBufferUint32, restartIndex, indexCount, baseIndex, onIndex, onOutIndex);
2758     }
2759 }
2760 static inline void generatePrimitive(
2761                                    device ushort *outIndexBufferUint16,
2762                                    device uint *outIndexBufferUint32,
2763                                    const uint firstVertex,
2764                                    const uint indexCount,
2765                                    thread uint &baseIndex,
2766                                    uint onIndex,
2767                                    uint primCount,
2768                                    thread uint &onOutIndex
2769                                    )
2770 {
2771     if(baseIndex > onIndex) return;
2772     switch(fixIndexBufferMode)
2773     {
2774         case 0x00U:
2775         {
2776             ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = onIndex + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = onIndex + firstVertex; } onOutIndex++; });
2777         }
2778         break;
2779         case 0x01U:
2780         {
2781             auto tmpIndex0 = onIndex + 0;
2782             auto tmpIndex1 = onIndex + 1;
2783             if(fixIndexBufferKey & 0x00200U)
2784             {
2785                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2786                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2787             }
2788             else
2789             {
2790                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2791                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2792             }
2793         }
2794         break;
2795         case 0x02U:
2796         {
2797             auto tmpIndex0 = onIndex + 0;
2798             auto tmpIndex1 = (onIndex + 1) % primCount;
2799             if(fixIndexBufferKey & 0x00200U)
2800             {
2801                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2802                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2803             }
2804             else
2805             {
2806                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2807                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2808             }
2809         }
2810         break;
2811         case 0x03U:
2812         {
2813             auto tmpIndex0 = onIndex + 0;
2814             auto tmpIndex1 = onIndex + 1;
2815             if(fixIndexBufferKey & 0x00200U)
2816             {
2817                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2818                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2819             }
2820             else
2821             {
2822                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2823                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2824             }
2825         }
2826         break;
2827         case 0x04U:
2828         {
2829             auto tmpIndex0 = onIndex + 0;
2830             auto tmpIndex1 = onIndex + 1;
2831             auto tmpIndex2 = onIndex + 2;
2832             if(fixIndexBufferKey & 0x00200U)
2833             {
2834                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2835                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2836                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2837             }
2838             else
2839             {
2840                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2841                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2842                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2843             }
2844         }
2845         break;
2846         case 0x05U:
2847         {
2848             uint isOdd = ((onIndex - baseIndex) & 1);
2849             auto tmpIndex0 = onIndex + 0 + isOdd;
2850             auto tmpIndex1 = onIndex + 1 - isOdd;
2851             auto tmpIndex2 = onIndex + 2;
2852             if(fixIndexBufferKey & 0x00200U)
2853             {
2854                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2855                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2856                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2857             }
2858             else
2859             {
2860                 if(isOdd)
2861                 {
2862                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2863                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2864                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2865                 }
2866                 else
2867                 {
2868                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2869                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2870                     ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2871                 }
2872             }
2873             assert(onOutIndex <= (onIndex + 1) * 3);
2874             assert(onOutIndex <= (indexCount - 2) * 3);
2875             break;
2876         }
2877         case 0x06U:
2878         {
2879             auto tmpIndex0 = 0;
2880             auto tmpIndex1 = onIndex + 1;
2881             auto tmpIndex2 = onIndex + 2;
2882             if(fixIndexBufferKey & 0x00200U)
2883             {
2884                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2885                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2886                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2887             }
2888             else
2889             {
2890                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2891                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2892                 ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2893             }
2894         }
2895         break;
2896     }
2897 }
2898 kernel void genIndexBuffer(
2899                            device ushort *outIndexBufferUint16 [[ buffer(1), function_constant(outIndexBufferIsUint16) ]],
2900                            device uint *outIndexBufferUint32 [[ buffer(1), function_constant(outIndexBufferIsUint32) ]],
2901                            constant uint &indexCount [[ buffer(2) ]],
2902                            constant uint &primCount [[ buffer(3) ]],
2903                            constant uint &firstVertex [[ buffer(4) ]],
2904                            uint prim [[thread_position_in_grid]])
2905 {
2906     uint baseIndex = 0;
2907     uint onIndex = onIndex;
2908     uint onOutIndex = onOutIndex;
2909     if(prim < primCount)
2910     {
2911         switch(fixIndexBufferMode)
2912         {
2913             case 0x00U:
2914                 onIndex = prim;
2915                 onOutIndex = prim;
2916                 break;
2917             case 0x01U:
2918                 onIndex = prim * 2;
2919                 onOutIndex = prim * 2;
2920                 break;
2921             case 0x03U:
2922                 onIndex = prim;
2923                 onOutIndex = prim * 2;
2924                 break;
2925             case 0x02U:
2926                 onIndex = prim;
2927                 onOutIndex = prim * 2;
2928                 break;
2929             case 0x04U:
2930                 onIndex = prim * 3;
2931                 onOutIndex = prim * 3;
2932                 break;
2933             case 0x05U:
2934                 onIndex = prim;
2935                 onOutIndex = prim * 3;
2936                 break;
2937             case 0x06U:
2938                 onIndex = prim;
2939                 onOutIndex = prim * 3;
2940                 break;
2941         }
2942         generatePrimitive(outIndexBufferUint16, outIndexBufferUint32, firstVertex, indexCount, baseIndex, onIndex, primCount, onOutIndex);
2943     }
2944 }
2945 
2946 
2947 )";
2948