• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// GENERATED FILE - DO NOT EDIT.
2// Generated by gen_mtl_internal_shaders.py
3//
4// Copyright 2020 The ANGLE Project Authors. All rights reserved.
5// Use of this source code is governed by a BSD-style license that can be
6// found in the LICENSE file.
7//
8
9// Combined Metal default shaders.
10
11# include <simd/simd.h>
12# include <metal_stdlib>
13namespace rx
14{
15namespace mtl_shader
16{
17enum
18{
19    kTextureType2D = 0,
20    kTextureType2DMultisample = 1,
21    kTextureType2DArray = 2,
22    kTextureTypeCube = 3,
23    kTextureType3D = 4,
24    kTextureTypeCount = 5,
25};
26}
27}
28using namespace metal;
29constant uint32_t kNumColorOutputs [[function_constant(0)]];
30constant bool kColorOutputAvailable0 = kNumColorOutputs > 0;
31constant bool kColorOutputAvailable1 = kNumColorOutputs > 1;
32constant bool kColorOutputAvailable2 = kNumColorOutputs > 2;
33constant bool kColorOutputAvailable3 = kNumColorOutputs > 3;
34constant bool kColorOutputAvailable4 = kNumColorOutputs > 4;
35constant bool kColorOutputAvailable5 = kNumColorOutputs > 5;
36constant bool kColorOutputAvailable6 = kNumColorOutputs > 6;
37constant bool kColorOutputAvailable7 = kNumColorOutputs > 7;
38namespace rx
39{
40namespace mtl_shader
41{
42constant float2 gCorners[3] = {float2(-1.0f, -1.0f), float2(3.0f, -1.0f), float2(-1.0f, 3.0f)};
43template <typename T>
44struct MultipleColorOutputs
45{
46    vec<T, 4> color0 [[color(0), function_constant(kColorOutputAvailable0)]];
47    vec<T, 4> color1 [[color(1), function_constant(kColorOutputAvailable1)]];
48    vec<T, 4> color2 [[color(2), function_constant(kColorOutputAvailable2)]];
49    vec<T, 4> color3 [[color(3), function_constant(kColorOutputAvailable3)]];
50    vec<T, 4> color4 [[color(4), function_constant(kColorOutputAvailable4)]];
51    vec<T, 4> color5 [[color(5), function_constant(kColorOutputAvailable5)]];
52    vec<T, 4> color6 [[color(6), function_constant(kColorOutputAvailable6)]];
53    vec<T, 4> color7 [[color(7), function_constant(kColorOutputAvailable7)]];
54};
55template <typename T>
56static inline MultipleColorOutputs<T> toMultipleColorOutputs(vec<T, 4> color)
57{
58    MultipleColorOutputs<T> re;
59    do { if (kColorOutputAvailable0) { re.color0 = color; } } while (0);
60    do { if (kColorOutputAvailable1) { re.color1 = color; } } while (0);
61    do { if (kColorOutputAvailable2) { re.color2 = color; } } while (0);
62    do { if (kColorOutputAvailable3) { re.color3 = color; } } while (0);
63    do { if (kColorOutputAvailable4) { re.color4 = color; } } while (0);
64    do { if (kColorOutputAvailable5) { re.color5 = color; } } while (0);
65    do { if (kColorOutputAvailable6) { re.color6 = color; } } while (0);
66    do { if (kColorOutputAvailable7) { re.color7 = color; } } while (0);
67    return re;
68}
69static inline float3 cubeTexcoords(float2 texcoords, int face)
70{
71    texcoords = 2.0 * texcoords - 1.0;
72    switch (face)
73    {
74        case 0:
75            return float3(1.0, -texcoords.y, -texcoords.x);
76        case 1:
77            return float3(-1.0, -texcoords.y, texcoords.x);
78        case 2:
79            return float3(texcoords.x, 1.0, texcoords.y);
80        case 3:
81            return float3(texcoords.x, -1.0, -texcoords.y);
82        case 4:
83            return float3(texcoords.x, -texcoords.y, 1.0);
84        case 5:
85            return float3(-texcoords.x, -texcoords.y, -1.0);
86    }
87    return float3(texcoords, 0);
88}
89template <typename T>
90static inline vec<T, 4> resolveTextureMS(texture2d_ms<T> srcTexture, uint2 coords)
91{
92    uint samples = srcTexture.get_num_samples();
93    vec<T, 4> output(0);
94    for (uint sample = 0; sample < samples; ++sample)
95    {
96        output += srcTexture.read(coords, sample);
97    }
98    output = output / samples;
99    return output;
100}
101static inline float4 sRGBtoLinear(float4 color)
102{
103    float3 linear1 = color.rgb / 12.92;
104    float3 linear2 = powr((color.rgb + float3(0.055)) / 1.055, 2.4);
105    float3 factor = float3(color.rgb <= float3(0.04045));
106    float4 linear = float4(factor * linear1 + float3(1.0 - factor) * linear2, color.a);
107    return linear;
108}
109static inline float linearToSRGB(float color)
110{
111    if (color <= 0.0f)
112        return 0.0f;
113    if (color < 0.0031308f)
114        return 12.92f * color;
115    if (color < 1.0f)
116        return 1.055f * powr(color, 0.41666f) - 0.055f;
117    return 1.0f;
118}
119static inline float4 linearToSRGB(float4 color)
120{
121    return float4(linearToSRGB(color.r), linearToSRGB(color.g), linearToSRGB(color.b), color.a);
122}
123template <typename Short>
124static inline Short bytesToShort(constant uchar *input, uint offset)
125{
126    Short inputLo = input[offset];
127    Short inputHi = input[offset + 1];
128    return inputLo | (inputHi << 8);
129}
130template <typename Int>
131static inline Int bytesToInt(constant uchar *input, uint offset)
132{
133    Int input0 = input[offset];
134    Int input1 = input[offset + 1];
135    Int input2 = input[offset + 2];
136    Int input3 = input[offset + 3];
137    return input0 | (input1 << 8) | (input2 << 16) | (input3 << 24);
138}
139template <typename Short>
140static inline void shortToBytes(Short val, uint offset, device uchar *output)
141{
142    ushort valUnsigned = as_type<ushort>(val);
143    output[offset] = valUnsigned & 0xff;
144    output[offset + 1] = (valUnsigned >> 8) & 0xff;
145}
146template <typename Int>
147static inline void intToBytes(Int val, uint offset, device uchar *output)
148{
149    uint valUnsigned = as_type<uint>(val);
150    output[offset] = valUnsigned & 0xff;
151    output[offset + 1] = (valUnsigned >> 8) & 0xff;
152    output[offset + 2] = (valUnsigned >> 16) & 0xff;
153    output[offset + 3] = (valUnsigned >> 24) & 0xff;
154}
155static inline void floatToBytes(float val, uint offset, device uchar *output)
156{
157    intToBytes(as_type<uint>(val), offset, output);
158}
159static inline void int24bitToBytes(uint val, uint offset, device uchar *output)
160{
161    output[offset] = val & 0xff;
162    output[offset + 1] = (val >> 8) & 0xff;
163    output[offset + 2] = (val >> 16) & 0xff;
164}
165template <unsigned int inputBitCount, unsigned int inputBitStart, typename T>
166static inline T getShiftedData(T input)
167{
168    static_assert(inputBitCount + inputBitStart <= (sizeof(T) * 8),
169                  "T must have at least as many bits as inputBitCount + inputBitStart.");
170    const T mask = (1 << inputBitCount) - 1;
171    return (input >> inputBitStart) & mask;
172}
173template <unsigned int inputBitCount, unsigned int inputBitStart, typename T>
174static inline T shiftData(T input)
175{
176    static_assert(inputBitCount + inputBitStart <= (sizeof(T) * 8),
177                  "T must have at least as many bits as inputBitCount + inputBitStart.");
178    const T mask = (1 << inputBitCount) - 1;
179    return (input & mask) << inputBitStart;
180}
181template <unsigned int inputBitCount, typename T>
182static inline float normalizedToFloat(T input)
183{
184    static_assert(inputBitCount <= (sizeof(T) * 8),
185                  "T must have more bits than or same bits as inputBitCount.");
186    static_assert(inputBitCount <= 23, "Only single precision is supported");
187    constexpr float inverseMax = 1.0f / ((1 << inputBitCount) - 1);
188    return input * inverseMax;
189}
190template <typename T>
191static inline float normalizedToFloat(T input)
192{
193    return normalizedToFloat<sizeof(T) * 8, T>(input);
194}
195template <>
196inline float normalizedToFloat(short input)
197{
198    constexpr float inverseMax = 1.0f / 0x7fff;
199    return static_cast<float>(input) * inverseMax;
200}
201template <>
202inline float normalizedToFloat(int input)
203{
204    constexpr float inverseMax = 1.0f / 0x7fffffff;
205    return static_cast<float>(input) * inverseMax;
206}
207template <>
208inline float normalizedToFloat(uint input)
209{
210    constexpr float inverseMax = 1.0f / 0xffffffff;
211    return static_cast<float>(input) * inverseMax;
212}
213template <unsigned int outputBitCount, typename T>
214static inline T floatToNormalized(float input)
215{
216    static_assert(outputBitCount <= (sizeof(T) * 8),
217                  "T must have more bits than or same bits as inputBitCount.");
218    static_assert(outputBitCount > (metal::is_unsigned<T>::value ? 0 : 1),
219                  "outputBitCount must be at least 1 not counting the sign bit.");
220    constexpr unsigned int bits =
221        metal::is_unsigned<T>::value ? outputBitCount : outputBitCount - 1;
222    static_assert(bits <= 23, "Only single precision is supported");
223    return static_cast<T>(metal::round(((1 << bits) - 1) * input));
224}
225template <typename T>
226static inline T floatToNormalized(float input)
227{
228    return floatToNormalized<sizeof(T) * 8, T>(input);
229}
230}
231}
232
233using namespace rx::mtl_shader;
234constant bool kPremultiplyAlpha [[function_constant(1)]];
235constant bool kUnmultiplyAlpha [[function_constant(2)]];
236constant bool kTransformLinearToSrgb [[function_constant(3)]];
237constant int kSourceTextureType [[function_constant(4)]];
238constant int kSourceTexture2Type [[function_constant(5)]];
239constant bool kSourceTextureType2D = kSourceTextureType == kTextureType2D;
240constant bool kSourceTextureType2DArray = kSourceTextureType == kTextureType2DArray;
241constant bool kSourceTextureType2DMS = kSourceTextureType == kTextureType2DMultisample;
242constant bool kSourceTextureTypeCube = kSourceTextureType == kTextureTypeCube;
243constant bool kSourceTextureType3D = kSourceTextureType == kTextureType3D;
244constant bool kSourceTexture2Type2D = kSourceTexture2Type == kTextureType2D;
245constant bool kSourceTexture2Type2DArray = kSourceTexture2Type == kTextureType2DArray;
246constant bool kSourceTexture2Type2DMS = kSourceTexture2Type == kTextureType2DMultisample;
247constant bool kSourceTexture2TypeCube = kSourceTexture2Type == kTextureTypeCube;
248struct BlitParams
249{
250    float4 srcTexCoords;
251    int srcLevel;
252    int srcLayer;
253    bool dstLuminance;
254    uint8_t padding[7];
255};
256struct BlitVSOut
257{
258    float4 position [[position]];
259    float2 texCoords [[center_no_perspective, user(locn1)]];
260};
261vertex BlitVSOut blitVS(unsigned int vid [[vertex_id]], constant BlitParams &options [[buffer(0)]])
262{
263    BlitVSOut output;
264    output.position.xy = select(float2(-1.0f), float2(1.0f), bool2(vid & uint2(2, 1)));
265    output.position.zw = float2(0.0, 1.0);
266    output.texCoords = select(options.srcTexCoords.xy, options.srcTexCoords.zw, bool2(vid & uint2(2, 1)));
267    return output;
268}
269template <typename SrcTexture2d>
270static uint2 getImageCoords(SrcTexture2d srcTexture, float2 texCoords)
271{
272    uint2 dimens(srcTexture.get_width(), srcTexture.get_height());
273    uint2 coords = uint2(texCoords * float2(dimens));
274    return coords;
275}
276template <typename T>
277static inline vec<T, 4> blitSampleTextureMS(texture2d_ms<T> srcTexture, float2 texCoords)
278{
279    uint2 coords = getImageCoords(srcTexture, texCoords);
280    return resolveTextureMS(srcTexture, coords);
281}
282template <typename T>
283static inline vec<T, 4> blitSampleTexture3D(texture3d<T> srcTexture,
284                                            sampler textureSampler,
285                                            float2 texCoords,
286                                            constant BlitParams &options)
287{
288    uint depth = srcTexture.get_depth(options.srcLevel);
289    float zCoord = (float(options.srcLayer) + 0.5) / float(depth);
290    return srcTexture.sample(textureSampler, float3(texCoords, zCoord), level(options.srcLevel));
291}
292template <typename T>
293static inline vec<T, 4> blitReadTexture(BlitVSOut input [[stage_in]], texture2d<T> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<T> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<T> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<T> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<T> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
294{
295    vec<T, 4> output;
296    switch (kSourceTextureType)
297    {
298        case kTextureType2D:
299            output = srcTexture2d.sample(textureSampler, input.texCoords, level(options.srcLevel));
300            break;
301        case kTextureType2DArray:
302            output = srcTexture2dArray.sample(textureSampler, input.texCoords, options.srcLayer,
303                                              level(options.srcLevel));
304            break;
305        case kTextureType2DMultisample:
306            output = blitSampleTextureMS(srcTexture2dMS, input.texCoords);
307            break;
308        case kTextureTypeCube:
309            output = srcTextureCube.sample(textureSampler,
310                                           cubeTexcoords(input.texCoords, options.srcLayer),
311                                           level(options.srcLevel));
312            break;
313        case kTextureType3D:
314            output = blitSampleTexture3D(srcTexture3d, textureSampler, input.texCoords, options);
315            break;
316    }
317    if (kTransformLinearToSrgb) {
318        output.x = linearToSRGB(output.x);
319        output.y = linearToSRGB(output.y);
320        output.z = linearToSRGB(output.z);
321    }
322    if (kUnmultiplyAlpha)
323    {
324        if (output.a != 0.0)
325        {
326            output.xyz /= output.a;
327        }
328    }
329    if (kPremultiplyAlpha)
330    {
331        output.xyz *= output.a;
332    }
333    if (options.dstLuminance)
334    {
335        output.g = output.b = output.r;
336    }
337    return output;
338}
339template <typename T>
340static inline MultipleColorOutputs<T> blitFS(BlitVSOut input [[stage_in]], texture2d<T> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<T> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<T> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<T> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<T> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
341{
342    vec<T, 4> output = blitReadTexture(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
343    return toMultipleColorOutputs(output);
344}
345fragment MultipleColorOutputs<float> blitFloatFS(BlitVSOut input [[stage_in]], texture2d<float> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<float> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<float> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<float> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<float> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
346{
347    return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
348}
349fragment MultipleColorOutputs<int> blitIntFS(BlitVSOut input [[stage_in]], texture2d<int> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<int> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<int> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<int> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<int> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
350{
351    return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
352}
353fragment MultipleColorOutputs<uint> blitUIntFS(BlitVSOut input [[stage_in]], texture2d<uint> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<uint> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<uint> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<uint> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<uint> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
354{
355    return blitFS(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
356}
357fragment MultipleColorOutputs<uint> copyTextureFloatToUIntFS(BlitVSOut input [[stage_in]], texture2d<float> srcTexture2d [[texture(0), function_constant(kSourceTextureType2D)]], texture2d_array<float> srcTexture2dArray [[texture(0), function_constant(kSourceTextureType2DArray)]], texture2d_ms<float> srcTexture2dMS [[texture(0), function_constant(kSourceTextureType2DMS)]], texturecube<float> srcTextureCube [[texture(0), function_constant(kSourceTextureTypeCube)]], texture3d<float> srcTexture3d [[texture(0), function_constant(kSourceTextureType3D)]], sampler textureSampler [[sampler(0)]], constant BlitParams &options [[buffer(0)]])
358{
359    float4 inputColor = blitReadTexture<>(input, srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube, srcTexture3d, textureSampler, options);
360    uint4 output = uint4(inputColor * float4(255.0));
361    return toMultipleColorOutputs(output);
362}
363struct FragmentDepthOut
364{
365    float depth [[depth(any)]];
366};
367static inline float sampleDepth(
368    texture2d<float> srcTexture2d [[function_constant(kSourceTextureType2D)]],
369    texture2d_array<float> srcTexture2dArray [[function_constant(kSourceTextureType2DArray)]],
370    texture2d_ms<float> srcTexture2dMS [[function_constant(kSourceTextureType2DMS)]],
371    texturecube<float> srcTextureCube [[function_constant(kSourceTextureTypeCube)]],
372    float2 texCoords,
373    constant BlitParams &options)
374{
375    float4 output;
376    constexpr sampler textureSampler(mag_filter::nearest, min_filter::nearest);
377    switch (kSourceTextureType)
378    {
379        case kTextureType2D:
380            output = srcTexture2d.sample(textureSampler, texCoords, level(options.srcLevel));
381            break;
382        case kTextureType2DArray:
383            output = srcTexture2dArray.sample(textureSampler, texCoords, options.srcLayer,
384                                              level(options.srcLevel));
385            break;
386        case kTextureType2DMultisample:
387            output = srcTexture2dMS.read(getImageCoords(srcTexture2dMS, texCoords), 0);
388            break;
389        case kTextureTypeCube:
390            output =
391                srcTextureCube.sample(textureSampler, cubeTexcoords(texCoords, options.srcLayer),
392                                      level(options.srcLevel));
393            break;
394    }
395    return output.r;
396}
397fragment FragmentDepthOut blitDepthFS(BlitVSOut input [[stage_in]],
398                                      texture2d<float> srcTexture2d
399                                      [[texture(0), function_constant(kSourceTextureType2D)]],
400                                      texture2d_array<float> srcTexture2dArray
401                                      [[texture(0), function_constant(kSourceTextureType2DArray)]],
402                                      texture2d_ms<float> srcTexture2dMS
403                                      [[texture(0), function_constant(kSourceTextureType2DMS)]],
404                                      texturecube<float> srcTextureCube
405                                      [[texture(0), function_constant(kSourceTextureTypeCube)]],
406                                      constant BlitParams &options [[buffer(0)]])
407{
408    FragmentDepthOut re;
409    re.depth = sampleDepth(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
410                           input.texCoords, options);
411    return re;
412}
413static inline uint32_t sampleStencil(
414    texture2d<uint32_t> srcTexture2d [[function_constant(kSourceTexture2Type2D)]],
415    texture2d_array<uint32_t> srcTexture2dArray [[function_constant(kSourceTexture2Type2DArray)]],
416    texture2d_ms<uint32_t> srcTexture2dMS [[function_constant(kSourceTexture2Type2DMS)]],
417    texturecube<uint32_t> srcTextureCube [[function_constant(kSourceTexture2TypeCube)]],
418    float2 texCoords,
419    int srcLevel,
420    int srcLayer)
421{
422    uint4 output;
423    constexpr sampler textureSampler(mag_filter::nearest, min_filter::nearest);
424    switch (kSourceTexture2Type)
425    {
426        case kTextureType2D:
427            output = srcTexture2d.sample(textureSampler, texCoords, level(srcLevel));
428            break;
429        case kTextureType2DArray:
430            output = srcTexture2dArray.sample(textureSampler, texCoords, srcLayer, level(srcLevel));
431            break;
432        case kTextureType2DMultisample:
433            output = srcTexture2dMS.read(getImageCoords(srcTexture2dMS, texCoords), 0);
434            break;
435        case kTextureTypeCube:
436            output = srcTextureCube.sample(textureSampler, cubeTexcoords(texCoords, srcLayer),
437                                           level(srcLevel));
438            break;
439    }
440    return output.r;
441}
442struct BlitStencilToBufferParams
443{
444    float2 srcStartTexCoords;
445    float2 srcTexCoordSteps;
446    int srcLevel;
447    int srcLayer;
448    uint2 dstSize;
449    uint dstBufferRowPitch;
450    bool resolveMS;
451};
452kernel void blitStencilToBufferCS(ushort2 gIndices [[thread_position_in_grid]],
453                                  texture2d<uint32_t> srcTexture2d
454                                  [[texture(1), function_constant(kSourceTexture2Type2D)]],
455                                  texture2d_array<uint32_t> srcTexture2dArray
456                                  [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
457                                  texture2d_ms<uint32_t> srcTexture2dMS
458                                  [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
459                                  texturecube<uint32_t> srcTextureCube
460                                  [[texture(1), function_constant(kSourceTexture2TypeCube)]],
461                                  constant BlitStencilToBufferParams &options [[buffer(0)]],
462                                  device uchar *buffer [[buffer(1)]])
463{
464    if (gIndices.x >= options.dstSize.x || gIndices.y >= options.dstSize.y)
465    {
466        return;
467    }
468    float2 srcTexCoords = options.srcStartTexCoords + float2(gIndices) * options.srcTexCoordSteps;
469    if (kSourceTexture2Type == kTextureType2DMultisample && !options.resolveMS)
470    {
471        uint samples = srcTexture2dMS.get_num_samples();
472        uint2 imageCoords = getImageCoords(srcTexture2dMS, srcTexCoords);
473        uint bufferOffset = options.dstBufferRowPitch * gIndices.y + samples * gIndices.x;
474        for (uint sample = 0; sample < samples; ++sample)
475        {
476            uint stencilPerSample = srcTexture2dMS.read(imageCoords, sample).r;
477            buffer[bufferOffset + sample] = static_cast<uchar>(stencilPerSample);
478        }
479    }
480    else
481    {
482        uint32_t stencil =
483            sampleStencil(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
484                          srcTexCoords, options.srcLevel, options.srcLayer);
485        buffer[options.dstBufferRowPitch * gIndices.y + gIndices.x] = static_cast<uchar>(stencil);
486    }
487}
488#if __METAL_VERSION__ >= 210
489struct FragmentStencilOut
490{
491    uint32_t stencil [[stencil]];
492};
493struct FragmentDepthStencilOut
494{
495    float depth [[depth(any)]];
496    uint32_t stencil [[stencil]];
497};
498fragment FragmentStencilOut blitStencilFS(
499    BlitVSOut input [[stage_in]],
500    texture2d<uint32_t> srcTexture2d [[texture(1), function_constant(kSourceTexture2Type2D)]],
501    texture2d_array<uint32_t> srcTexture2dArray
502    [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
503    texture2d_ms<uint32_t> srcTexture2dMS
504    [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
505    texturecube<uint32_t> srcTextureCube [[texture(1), function_constant(kSourceTexture2TypeCube)]],
506    constant BlitParams &options [[buffer(0)]])
507{
508    FragmentStencilOut re;
509    re.stencil = sampleStencil(srcTexture2d, srcTexture2dArray, srcTexture2dMS, srcTextureCube,
510                               input.texCoords, options.srcLevel, options.srcLayer);
511    return re;
512}
513fragment FragmentDepthStencilOut blitDepthStencilFS(
514    BlitVSOut input [[stage_in]],
515    texture2d<float> srcDepthTexture2d [[texture(0), function_constant(kSourceTextureType2D)]],
516    texture2d_array<float> srcDepthTexture2dArray
517    [[texture(0), function_constant(kSourceTextureType2DArray)]],
518    texture2d_ms<float> srcDepthTexture2dMS
519    [[texture(0), function_constant(kSourceTextureType2DMS)]],
520    texturecube<float> srcDepthTextureCube
521    [[texture(0), function_constant(kSourceTextureTypeCube)]],
522    texture2d<uint32_t> srcStencilTexture2d
523    [[texture(1), function_constant(kSourceTexture2Type2D)]],
524    texture2d_array<uint32_t> srcStencilTexture2dArray
525    [[texture(1), function_constant(kSourceTexture2Type2DArray)]],
526    texture2d_ms<uint32_t> srcStencilTexture2dMS
527    [[texture(1), function_constant(kSourceTexture2Type2DMS)]],
528    texturecube<uint32_t> srcStencilTextureCube
529    [[texture(1), function_constant(kSourceTexture2TypeCube)]],
530    constant BlitParams &options [[buffer(0)]])
531{
532    FragmentDepthStencilOut re;
533    re.depth = sampleDepth(srcDepthTexture2d, srcDepthTexture2dArray, srcDepthTexture2dMS,
534                           srcDepthTextureCube, input.texCoords, options);
535    re.stencil =
536        sampleStencil(srcStencilTexture2d, srcStencilTexture2dArray, srcStencilTexture2dMS,
537                      srcStencilTextureCube, input.texCoords, options.srcLevel, options.srcLayer);
538    return re;
539}
540#endif
541using namespace rx::mtl_shader;
542struct ClearParams
543{
544    float4 clearColor;
545    float clearDepth;
546};
547vertex float4 clearVS(unsigned int vid [[ vertex_id ]],
548                      constant ClearParams &clearParams [[buffer(0)]])
549{
550    return float4(gCorners[vid], clearParams.clearDepth, 1.0);
551}
552fragment MultipleColorOutputs<float> clearFloatFS(constant ClearParams &clearParams [[buffer(0)]])
553{
554    return toMultipleColorOutputs(clearParams.clearColor);
555}
556fragment MultipleColorOutputs<int> clearIntFS(constant ClearParams &clearParams [[buffer(0)]])
557{
558    return toMultipleColorOutputs(as_type<int4>(clearParams.clearColor));
559}
560fragment MultipleColorOutputs<uint> clearUIntFS(constant ClearParams &clearParams [[buffer(0)]])
561{
562    return toMultipleColorOutputs(as_type<uint4>(clearParams.clearColor));
563}
564using namespace rx::mtl_shader;
565constant bool kSourceBufferAligned[[function_constant(100)]];
566constant bool kSourceIndexIsU8[[function_constant(200)]];
567constant bool kSourceIndexIsU16[[function_constant(300)]];
568constant bool kSourceIndexIsU32[[function_constant(400)]];
569constant bool kSourceBufferUnaligned = !kSourceBufferAligned;
570constant bool kUseSourceBufferU8 = kSourceIndexIsU8 || kSourceBufferUnaligned;
571constant bool kUseSourceBufferU16 = kSourceIndexIsU16 && kSourceBufferAligned;
572constant bool kUseSourceBufferU32 = kSourceIndexIsU32 && kSourceBufferAligned;
573struct IndexConversionParams
574{
575    uint32_t srcOffset;
576    uint32_t indexCount;
577    bool primitiveRestartEnabled;
578};
579inline ushort getIndexAligned(constant ushort *inputAligned, uint offset, uint idx)
580{
581    return inputAligned[offset / 2 + idx];
582}
583inline uint getIndexAligned(constant uint *inputAligned, uint offset, uint idx)
584{
585    return inputAligned[offset / 4 + idx];
586}
587inline uchar getIndexAligned(constant uchar *input, uint offset, uint idx)
588{
589    return input[offset + idx];
590}
591inline ushort getIndexUnalignedU16(constant uchar *input, uint offset, uint idx)
592{
593    ushort inputLo = input[offset + 2 * idx];
594    ushort inputHi = input[offset + 2 * idx + 1];
595    return inputLo | (inputHi << 8);
596}
597inline uint getIndexUnalignedU32(constant uchar *input, uint offset, uint idx)
598{
599    uint input0 = input[offset + 4 * idx];
600    uint input1 = input[offset + 4 * idx + 1];
601    uint input2 = input[offset + 4 * idx + 2];
602    uint input3 = input[offset + 4 * idx + 3];
603    return input0 | (input1 << 8) | (input2 << 16) | (input3 << 24);
604}
605kernel void convertIndexU8ToU16(uint idx [[thread_position_in_grid]],
606                                constant IndexConversionParams &options [[buffer(0)]],
607                                constant uchar *input [[buffer(1)]],
608                                device ushort *output [[buffer(2)]])
609{
610    if (idx >= options.indexCount) { return; };
611    uchar value = getIndexAligned(input, options.srcOffset, idx);
612    if (options.primitiveRestartEnabled && value == 0xff)
613    {
614        output[idx] = 0xffff;
615    }
616    else
617    {
618        output[idx] = value;
619    }
620}
621kernel void convertIndexU16(uint idx [[thread_position_in_grid]],
622                            constant IndexConversionParams &options [[buffer(0)]],
623                            constant uchar *input
624                            [[buffer(1), function_constant(kSourceBufferUnaligned)]],
625                            constant ushort *inputAligned
626                            [[buffer(1), function_constant(kSourceBufferAligned)]],
627                            device ushort *output [[buffer(2)]])
628{
629    if (idx >= options.indexCount) { return; };
630    ushort value;
631    if (kSourceBufferAligned)
632    {
633        value = getIndexAligned(inputAligned, options.srcOffset, idx);
634    }
635    else
636    {
637        value = getIndexUnalignedU16(input, options.srcOffset, idx);
638    }
639    output[idx] = value;
640}
641kernel void convertIndexU32(uint idx [[thread_position_in_grid]],
642                            constant IndexConversionParams &options [[buffer(0)]],
643                            constant uchar *input
644                            [[buffer(1), function_constant(kSourceBufferUnaligned)]],
645                            constant uint *inputAligned
646                            [[buffer(1), function_constant(kSourceBufferAligned)]],
647                            device uint *output [[buffer(2)]])
648{
649    if (idx >= options.indexCount) { return; };
650    uint value;
651    if (kSourceBufferAligned)
652    {
653        value = getIndexAligned(inputAligned, options.srcOffset, idx);
654    }
655    else
656    {
657        value = getIndexUnalignedU32(input, options.srcOffset, idx);
658    }
659    output[idx] = value;
660}
661struct IndexFromArrayParams
662{
663    uint firstVertex;
664    uint vertexCount;
665};
666kernel void genTriFanIndicesFromArray(uint idx [[thread_position_in_grid]],
667                                      constant IndexFromArrayParams &options [[buffer(0)]],
668                                      device uint *output [[buffer(2)]])
669{
670    if (idx >= options.vertexCount) { return; };
671    uint vertexIdx = options.firstVertex + 2 + idx;
672    output[3 * idx ] = vertexIdx - 1;
673    output[3 * idx + 1] = vertexIdx;
674    output[3 * idx + 2] = options.firstVertex;
675}
676inline uint getIndexU32(uint offset,
677                        uint idx,
678                        constant uchar *inputU8 [[function_constant(kUseSourceBufferU8)]],
679                        constant ushort *inputU16 [[function_constant(kUseSourceBufferU16)]],
680                        constant uint *inputU32 [[function_constant(kUseSourceBufferU32)]])
681{
682    if (kUseSourceBufferU8)
683    {
684        if (kSourceIndexIsU16)
685        {
686            return getIndexUnalignedU16(inputU8, offset, idx);
687        }
688        else if (kSourceIndexIsU32)
689        {
690            return getIndexUnalignedU32(inputU8, offset, idx);
691        }
692        return getIndexAligned(inputU8, offset, idx);
693    }
694    else if (kUseSourceBufferU16)
695    {
696        return getIndexAligned(inputU16, offset, idx);
697    }
698    else if (kUseSourceBufferU32)
699    {
700        return getIndexAligned(inputU32, offset, idx);
701    }
702    return 0;
703}
704kernel void genTriFanIndicesFromElements(uint idx [[thread_position_in_grid]],
705                                         constant IndexConversionParams &options [[buffer(0)]],
706                                         constant uchar *inputU8
707                                         [[buffer(1), function_constant(kUseSourceBufferU8)]],
708                                         constant ushort *inputU16
709                                         [[buffer(1), function_constant(kUseSourceBufferU16)]],
710                                         constant uint *inputU32
711                                         [[buffer(1), function_constant(kUseSourceBufferU32)]],
712                                         device uint *output [[buffer(2)]])
713{
714    if (idx >= options.indexCount) { return; };
715    uint elemIdx = 2 + idx;
716    output[3 * idx] = getIndexU32(options.srcOffset, 0, inputU8, inputU16, inputU32);
717    output[3 * idx + 1] = getIndexU32(options.srcOffset, elemIdx - 1, inputU8, inputU16, inputU32);
718    output[3 * idx + 2] = getIndexU32(options.srcOffset, elemIdx, inputU8, inputU16, inputU32);
719}
720kernel void genLineLoopIndicesFromArray(uint idx [[thread_position_in_grid]],
721                                        constant IndexFromArrayParams &options [[buffer(0)]],
722                                        device uint *output [[buffer(2)]])
723{
724    uint totalIndices = options.vertexCount + 1;
725    if (idx >= totalIndices) { return; };
726    output[idx] = options.firstVertex + idx % options.vertexCount;
727}
728kernel void genLineLoopIndicesFromElements(uint idx [[thread_position_in_grid]],
729                                           constant IndexConversionParams &options [[buffer(0)]],
730                                           constant uchar *inputU8
731                                           [[buffer(1), function_constant(kUseSourceBufferU8)]],
732                                           constant ushort *inputU16
733                                           [[buffer(1), function_constant(kUseSourceBufferU16)]],
734                                           constant uint *inputU32
735                                           [[buffer(1), function_constant(kUseSourceBufferU32)]],
736                                           device uint *output [[buffer(2)]])
737{
738    uint totalTargetIndices = options.indexCount + 1;
739    if (idx >= totalTargetIndices) { return; };
740    output[idx] =
741        getIndexU32(options.srcOffset, idx % options.indexCount, inputU8, inputU16, inputU32);
742}
743using namespace rx::mtl_shader;
744struct GenMipParams
745{
746    uint srcLevel;
747    uint numMipLevelsToGen;
748    bool sRGB;
749};
750kernel void generate3DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
751                              ushort3 gIndices [[thread_position_in_grid]],
752                              texture3d<float> srcTexture [[texture(0)]],
753                              texture3d<float, access::write> dstMip1 [[texture(1)]],
754                              texture3d<float, access::write> dstMip2 [[texture(2)]],
755                              texture3d<float, access::write> dstMip3 [[texture(3)]],
756                              texture3d<float, access::write> dstMip4 [[texture(4)]],
757                              constant GenMipParams &options [[buffer(0)]])
758{
759    ushort3 mipSize = ushort3(dstMip1.get_width(), dstMip1.get_height(), dstMip1.get_depth());
760    bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y && gIndices.z < mipSize.z;
761    constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
762    threadgroup float sR[(8 * 8 * 8)];
763    threadgroup float sG[(8 * 8 * 8)];
764    threadgroup float sB[(8 * 8 * 8)];
765    threadgroup float sA[(8 * 8 * 8)];
766    float4 texel1;
767    if (validThread)
768    {
769        float3 texCoords = (float3(gIndices) + float3(0.5, 0.5, 0.5)) / float3(mipSize);
770        texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
771        dstMip1.write(texel1, gIndices);
772    }
773    else
774    {
775        lIndex = 0xffffffff;
776    }
777    if (options.numMipLevelsToGen == 1)
778    {
779        return;
780    }
781    if (options.sRGB)
782    {
783        texel1 = linearToSRGB(texel1);
784    }
785    sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
786    threadgroup_barrier(mem_flags::mem_threadgroup);
787    if ((lIndex & 0x49) == 0)
788    {
789        bool3 atEdge = gIndices == (mipSize - ushort3(1));
790        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
791        float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
792        float4 texel4 = (atEdge.z) ? (texel1) : (float4(sR[lIndex + (8 * 8)], sG[lIndex + (8 * 8)], sB[lIndex + (8 * 8)], sA[lIndex + (8 * 8)]));
793        float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
794        float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + ((8 * 8) + 1)], sG[lIndex + ((8 * 8) + 1)], sB[lIndex + ((8 * 8) + 1)], sA[lIndex + ((8 * 8) + 1)]));
795        float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + ((8 * 8) + 8)], sG[lIndex + ((8 * 8) + 8)], sB[lIndex + ((8 * 8) + 8)], sA[lIndex + ((8 * 8) + 8)]));
796        float4 texel8 =
797            (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + ((8 * 8) + 8 + 1)], sG[lIndex + ((8 * 8) + 8 + 1)], sB[lIndex + ((8 * 8) + 8 + 1)], sA[lIndex + ((8 * 8) + 8 + 1)]));
798        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
799        dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 1);
800        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
801    }
802    if (options.numMipLevelsToGen == 2)
803    {
804        return;
805    }
806    threadgroup_barrier(mem_flags::mem_threadgroup);
807    if ((lIndex & 0xdb) == 0)
808    {
809        mipSize = max(mipSize >> 1, ushort3(1));
810        bool3 atEdge = (gIndices >> 1) == (mipSize - ushort3(1));
811        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
812        float4 texel3 =
813            (atEdge.y) ? (texel1) : (float4(sR[lIndex + (2 * 8)], sG[lIndex + (2 * 8)], sB[lIndex + (2 * 8)], sA[lIndex + (2 * 8)]));
814        float4 texel4 =
815            (atEdge.z) ? (texel1) : (float4(sR[lIndex + (2 * (8 * 8))], sG[lIndex + (2 * (8 * 8))], sB[lIndex + (2 * (8 * 8))], sA[lIndex + (2 * (8 * 8))]));
816        float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
817        float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + (2 * (8 * 8) + 2)], sG[lIndex + (2 * (8 * 8) + 2)], sB[lIndex + (2 * (8 * 8) + 2)], sA[lIndex + (2 * (8 * 8) + 2)]));
818        float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + (2 * (8 * 8) + 2 * 8)], sG[lIndex + (2 * (8 * 8) + 2 * 8)], sB[lIndex + (2 * (8 * 8) + 2 * 8)], sA[lIndex + (2 * (8 * 8) + 2 * 8)]));
819        float4 texel8 = (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sG[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sB[lIndex + (2 * (8 * 8) + 2 * 8 + 2)], sA[lIndex + (2 * (8 * 8) + 2 * 8 + 2)]));
820        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
821        dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 2);
822        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
823    }
824    if (options.numMipLevelsToGen == 3)
825    {
826        return;
827    }
828    threadgroup_barrier(mem_flags::mem_threadgroup);
829    if ((lIndex & 0x1ff) == 0)
830    {
831        mipSize = max(mipSize >> 1, ushort3(1));
832        bool3 atEdge = (gIndices >> 2) == (mipSize - ushort3(1));
833        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
834        float4 texel3 =
835            (atEdge.y) ? (texel1) : (float4(sR[lIndex + (4 * 8)], sG[lIndex + (4 * 8)], sB[lIndex + (4 * 8)], sA[lIndex + (4 * 8)]));
836        float4 texel4 =
837            (atEdge.z) ? (texel1) : (float4(sR[lIndex + (4 * (8 * 8))], sG[lIndex + (4 * (8 * 8))], sB[lIndex + (4 * (8 * 8))], sA[lIndex + (4 * (8 * 8))]));
838        float4 texel5 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
839        float4 texel6 = (atEdge.x | atEdge.z) ? (texel2) : (float4(sR[lIndex + (4 * (8 * 8) + 4)], sG[lIndex + (4 * (8 * 8) + 4)], sB[lIndex + (4 * (8 * 8) + 4)], sA[lIndex + (4 * (8 * 8) + 4)]));
840        float4 texel7 = (atEdge.y | atEdge.z) ? (texel3) : (float4(sR[lIndex + (4 * (8 * 8) + 4 * 8)], sG[lIndex + (4 * (8 * 8) + 4 * 8)], sB[lIndex + (4 * (8 * 8) + 4 * 8)], sA[lIndex + (4 * (8 * 8) + 4 * 8)]));
841        float4 texel8 = (atEdge.x | atEdge.y | atEdge.z) ? (texel5) : (float4(sR[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sG[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sB[lIndex + (4 * (8 * 8) + 4 * 8 + 4)], sA[lIndex + (4 * (8 * 8) + 4 * 8 + 4)]));
842        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
843        dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 3);
844    }
845}
846kernel void generate2DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
847                              ushort2 gIndices [[thread_position_in_grid]],
848                              texture2d<float> srcTexture [[texture(0)]],
849                              texture2d<float, access::write> dstMip1 [[texture(1)]],
850                              texture2d<float, access::write> dstMip2 [[texture(2)]],
851                              texture2d<float, access::write> dstMip3 [[texture(3)]],
852                              texture2d<float, access::write> dstMip4 [[texture(4)]],
853                              constant GenMipParams &options [[buffer(0)]])
854{
855    uint firstMipLevel = options.srcLevel + 1;
856    ushort2 mipSize =
857        ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
858    bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y;
859    constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
860    threadgroup float sR[(8 * 8)];
861    threadgroup float sG[(8 * 8)];
862    threadgroup float sB[(8 * 8)];
863    threadgroup float sA[(8 * 8)];
864    float4 texel1;
865    if (validThread)
866    {
867        float2 texCoords = (float2(gIndices) + float2(0.5, 0.5)) / float2(mipSize);
868        texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
869        dstMip1.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices);
870    }
871    else
872    {
873        lIndex = 0xffffffff;
874    }
875    if (options.numMipLevelsToGen == 1)
876    {
877        return;
878    }
879    sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
880    threadgroup_barrier(mem_flags::mem_threadgroup);
881    if ((lIndex & 0x09) == 0)
882    {
883        bool2 atEdge = gIndices == (mipSize - ushort2(1));
884        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
885        float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
886        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
887        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
888        dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 1);
889        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
890    }
891    if (options.numMipLevelsToGen == 2)
892    {
893        return;
894    }
895    threadgroup_barrier(mem_flags::mem_threadgroup);
896    if ((lIndex & 0x1b) == 0)
897    {
898        mipSize = max(mipSize >> 1, ushort2(1));
899        bool2 atEdge = (gIndices >> 1) == (mipSize - ushort2(1));
900        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
901        float4 texel3 =
902            (atEdge.y) ? (texel1) : (float4(sR[lIndex + 2 * 8], sG[lIndex + 2 * 8], sB[lIndex + 2 * 8], sA[lIndex + 2 * 8]));
903        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
904        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
905        dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 2);
906        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
907    }
908    if (options.numMipLevelsToGen == 3)
909    {
910        return;
911    }
912    threadgroup_barrier(mem_flags::mem_threadgroup);
913    if ((lIndex & 0x3f) == 0)
914    {
915        mipSize = max(mipSize >> 1, ushort2(1));
916        bool2 atEdge = (gIndices >> 2) == (mipSize - ushort2(1));
917        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
918        float4 texel3 =
919            (atEdge.y) ? (texel1) : (float4(sR[lIndex + 4 * 8], sG[lIndex + 4 * 8], sB[lIndex + 4 * 8], sA[lIndex + 4 * 8]));
920        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
921        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
922        dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices >> 3);
923    }
924}
925template <typename TextureTypeR, typename TextureTypeW>
926static __attribute__((always_inline)) void generateCubeOr2DArray2ndAndMoreMipmaps(
927    uint lIndex,
928    ushort3 gIndices,
929    TextureTypeR srcTexture,
930    TextureTypeW dstMip2,
931    TextureTypeW dstMip3,
932    TextureTypeW dstMip4,
933    ushort2 mip1Size,
934    float4 mip1Texel,
935    threadgroup float *sR,
936    threadgroup float *sG,
937    threadgroup float *sB,
938    threadgroup float *sA,
939    constant GenMipParams &options)
940{
941    ushort2 mipSize = mip1Size;
942    float4 texel1 = mip1Texel;
943    sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
944    threadgroup_barrier(mem_flags::mem_threadgroup);
945    if ((lIndex & 0x09) == 0)
946    {
947        bool2 atEdge = gIndices.xy == (mipSize - ushort2(1));
948        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 1], sG[lIndex + 1], sB[lIndex + 1], sA[lIndex + 1]));
949        float4 texel3 = (atEdge.y) ? (texel1) : (float4(sR[lIndex + 8], sG[lIndex + 8], sB[lIndex + 8], sA[lIndex + 8]));
950        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (8 + 1)], sG[lIndex + (8 + 1)], sB[lIndex + (8 + 1)], sA[lIndex + (8 + 1)]));
951        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
952        dstMip2.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 1, gIndices.z);
953        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
954    }
955    if (options.numMipLevelsToGen == 2)
956    {
957        return;
958    }
959    threadgroup_barrier(mem_flags::mem_threadgroup);
960    if ((lIndex & 0x1b) == 0)
961    {
962        mipSize = max(mipSize >> 1, ushort2(1));
963        bool2 atEdge = (gIndices.xy >> 1) == (mipSize - ushort2(1));
964        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 2], sG[lIndex + 2], sB[lIndex + 2], sA[lIndex + 2]));
965        float4 texel3 =
966            (atEdge.y) ? (texel1) : (float4(sR[lIndex + 2 * 8], sG[lIndex + 2 * 8], sB[lIndex + 2 * 8], sA[lIndex + 2 * 8]));
967        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (2 * 8 + 2)], sG[lIndex + (2 * 8 + 2)], sB[lIndex + (2 * 8 + 2)], sA[lIndex + (2 * 8 + 2)]));
968        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
969        dstMip3.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 2, gIndices.z);
970        sR[lIndex] = texel1.r; sG[lIndex] = texel1.g; sB[lIndex] = texel1.b; sA[lIndex] = texel1.a;;
971    }
972    if (options.numMipLevelsToGen == 3)
973    {
974        return;
975    }
976    threadgroup_barrier(mem_flags::mem_threadgroup);
977    if ((lIndex & 0x3f) == 0)
978    {
979        mipSize = max(mipSize >> 1, ushort2(1));
980        bool2 atEdge = (gIndices.xy >> 2) == (mipSize - ushort2(1));
981        float4 texel2 = (atEdge.x) ? (texel1) : (float4(sR[lIndex + 4], sG[lIndex + 4], sB[lIndex + 4], sA[lIndex + 4]));
982        float4 texel3 =
983            (atEdge.y) ? (texel1) : (float4(sR[lIndex + 4 * 8], sG[lIndex + 4 * 8], sB[lIndex + 4 * 8], sA[lIndex + 4 * 8]));
984        float4 texel4 = (atEdge.x | atEdge.y) ? (texel2) : (float4(sR[lIndex + (4 * 8 + 4)], sG[lIndex + (4 * 8 + 4)], sB[lIndex + (4 * 8 + 4)], sA[lIndex + (4 * 8 + 4)]));
985        texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
986        dstMip4.write((options.sRGB ? sRGBtoLinear(texel1) : texel1), gIndices.xy >> 3, gIndices.z);
987    }
988}
989kernel void generateCubeMipmaps(uint lIndex [[thread_index_in_threadgroup]],
990                                ushort3 gIndices [[thread_position_in_grid]],
991                                texturecube<float> srcTexture [[texture(0)]],
992                                texturecube<float, access::write> dstMip1 [[texture(1)]],
993                                texturecube<float, access::write> dstMip2 [[texture(2)]],
994                                texturecube<float, access::write> dstMip3 [[texture(3)]],
995                                texturecube<float, access::write> dstMip4 [[texture(4)]],
996                                constant GenMipParams &options [[buffer(0)]])
997{
998    uint firstMipLevel = options.srcLevel + 1;
999    ushort2 mip1Size =
1000        ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
1001    bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
1002    constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
1003    float4 mip1Texel;
1004    if (validThread)
1005    {
1006        float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
1007        mip1Texel = srcTexture.sample(textureSampler, cubeTexcoords(texCoords, int(gIndices.z)),
1008                                      level(options.srcLevel));
1009        dstMip1.write((options.sRGB ? sRGBtoLinear(mip1Texel) : mip1Texel), gIndices.xy, gIndices.z);
1010    }
1011    else
1012    {
1013        lIndex = 0xffffffff;
1014    }
1015    if (options.numMipLevelsToGen == 1)
1016    {
1017        return;
1018    }
1019    threadgroup float sR[(8 * 8)];
1020    threadgroup float sG[(8 * 8)];
1021    threadgroup float sB[(8 * 8)];
1022    threadgroup float sA[(8 * 8)];
1023    generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
1024                                           mip1Size, mip1Texel, sR, sG, sB, sA, options);
1025}
1026kernel void generate2DArrayMipmaps(uint lIndex [[thread_index_in_threadgroup]],
1027                                   ushort3 gIndices [[thread_position_in_grid]],
1028                                   texture2d_array<float> srcTexture [[texture(0)]],
1029                                   texture2d_array<float, access::write> dstMip1 [[texture(1)]],
1030                                   texture2d_array<float, access::write> dstMip2 [[texture(2)]],
1031                                   texture2d_array<float, access::write> dstMip3 [[texture(3)]],
1032                                   texture2d_array<float, access::write> dstMip4 [[texture(4)]],
1033                                   constant GenMipParams &options [[buffer(0)]])
1034{
1035    uint firstMipLevel = options.srcLevel + 1;
1036    ushort2 mip1Size =
1037        ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
1038    bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
1039    constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
1040    float4 mip1Texel;
1041    if (validThread)
1042    {
1043        float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
1044        mip1Texel =
1045            srcTexture.sample(textureSampler, texCoords, gIndices.z, level(options.srcLevel));
1046        dstMip1.write((options.sRGB ? sRGBtoLinear(mip1Texel) : mip1Texel), gIndices.xy, gIndices.z);
1047    }
1048    else
1049    {
1050        lIndex = 0xffffffff;
1051    }
1052    if (options.numMipLevelsToGen == 1)
1053    {
1054        return;
1055    }
1056    threadgroup float sR[(8 * 8)];
1057    threadgroup float sG[(8 * 8)];
1058    threadgroup float sB[(8 * 8)];
1059    threadgroup float sA[(8 * 8)];
1060    generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
1061                                           mip1Size, mip1Texel, sR, sG, sB, sA, options);
1062}
1063#include <metal_pack>
1064namespace rx
1065{
1066namespace mtl_shader
1067{
1068namespace FormatID
1069{
1070enum
1071{
1072    NONE,
1073    D16_UNORM,
1074    D24_UNORM_S8_UINT,
1075    D24_UNORM_X8_UINT,
1076    D32_FLOAT,
1077    D32_FLOAT_S8X24_UINT,
1078    D32_UNORM,
1079    S8_UINT,
1080    A16_FLOAT,
1081    A1R5G5B5_UNORM,
1082    A2R10G10B10_SINT_VERTEX,
1083    A2R10G10B10_SNORM_VERTEX,
1084    A2R10G10B10_SSCALED_VERTEX,
1085    A2R10G10B10_UINT_VERTEX,
1086    A2R10G10B10_UNORM_VERTEX,
1087    A2R10G10B10_USCALED_VERTEX,
1088    A32_FLOAT,
1089    A8_UNORM,
1090    ASTC_10x10_SRGB_BLOCK,
1091    ASTC_10x10_UNORM_BLOCK,
1092    ASTC_10x5_SRGB_BLOCK,
1093    ASTC_10x5_UNORM_BLOCK,
1094    ASTC_10x6_SRGB_BLOCK,
1095    ASTC_10x6_UNORM_BLOCK,
1096    ASTC_10x8_SRGB_BLOCK,
1097    ASTC_10x8_UNORM_BLOCK,
1098    ASTC_12x10_SRGB_BLOCK,
1099    ASTC_12x10_UNORM_BLOCK,
1100    ASTC_12x12_SRGB_BLOCK,
1101    ASTC_12x12_UNORM_BLOCK,
1102    ASTC_3x3x3_UNORM_BLOCK,
1103    ASTC_3x3x3_UNORM_SRGB_BLOCK,
1104    ASTC_4x3x3_UNORM_BLOCK,
1105    ASTC_4x3x3_UNORM_SRGB_BLOCK,
1106    ASTC_4x4_SRGB_BLOCK,
1107    ASTC_4x4_UNORM_BLOCK,
1108    ASTC_4x4x3_UNORM_BLOCK,
1109    ASTC_4x4x3_UNORM_SRGB_BLOCK,
1110    ASTC_4x4x4_UNORM_BLOCK,
1111    ASTC_4x4x4_UNORM_SRGB_BLOCK,
1112    ASTC_5x4_SRGB_BLOCK,
1113    ASTC_5x4_UNORM_BLOCK,
1114    ASTC_5x4x4_UNORM_BLOCK,
1115    ASTC_5x4x4_UNORM_SRGB_BLOCK,
1116    ASTC_5x5_SRGB_BLOCK,
1117    ASTC_5x5_UNORM_BLOCK,
1118    ASTC_5x5x4_UNORM_BLOCK,
1119    ASTC_5x5x4_UNORM_SRGB_BLOCK,
1120    ASTC_5x5x5_UNORM_BLOCK,
1121    ASTC_5x5x5_UNORM_SRGB_BLOCK,
1122    ASTC_6x5_SRGB_BLOCK,
1123    ASTC_6x5_UNORM_BLOCK,
1124    ASTC_6x5x5_UNORM_BLOCK,
1125    ASTC_6x5x5_UNORM_SRGB_BLOCK,
1126    ASTC_6x6_SRGB_BLOCK,
1127    ASTC_6x6_UNORM_BLOCK,
1128    ASTC_6x6x5_UNORM_BLOCK,
1129    ASTC_6x6x5_UNORM_SRGB_BLOCK,
1130    ASTC_6x6x6_UNORM_BLOCK,
1131    ASTC_6x6x6_UNORM_SRGB_BLOCK,
1132    ASTC_8x5_SRGB_BLOCK,
1133    ASTC_8x5_UNORM_BLOCK,
1134    ASTC_8x6_SRGB_BLOCK,
1135    ASTC_8x6_UNORM_BLOCK,
1136    ASTC_8x8_SRGB_BLOCK,
1137    ASTC_8x8_UNORM_BLOCK,
1138    B10G10R10A2_UNORM,
1139    B4G4R4A4_UNORM,
1140    B5G5R5A1_UNORM,
1141    B5G6R5_UNORM,
1142    B8G8R8A8_TYPELESS,
1143    B8G8R8A8_TYPELESS_SRGB,
1144    B8G8R8A8_UNORM,
1145    B8G8R8A8_UNORM_SRGB,
1146    B8G8R8X8_UNORM,
1147    B8G8R8X8_UNORM_SRGB,
1148    BC1_RGBA_UNORM_BLOCK,
1149    BC1_RGBA_UNORM_SRGB_BLOCK,
1150    BC1_RGB_UNORM_BLOCK,
1151    BC1_RGB_UNORM_SRGB_BLOCK,
1152    BC2_RGBA_UNORM_BLOCK,
1153    BC2_RGBA_UNORM_SRGB_BLOCK,
1154    BC3_RGBA_UNORM_BLOCK,
1155    BC3_RGBA_UNORM_SRGB_BLOCK,
1156    BC4_RED_SNORM_BLOCK,
1157    BC4_RED_UNORM_BLOCK,
1158    BC5_RG_SNORM_BLOCK,
1159    BC5_RG_UNORM_BLOCK,
1160    BC6H_RGB_SFLOAT_BLOCK,
1161    BC6H_RGB_UFLOAT_BLOCK,
1162    BC7_RGBA_UNORM_BLOCK,
1163    BC7_RGBA_UNORM_SRGB_BLOCK,
1164    EAC_R11G11_SNORM_BLOCK,
1165    EAC_R11G11_UNORM_BLOCK,
1166    EAC_R11_SNORM_BLOCK,
1167    EAC_R11_UNORM_BLOCK,
1168    ETC1_LOSSY_DECODE_R8G8B8_UNORM_BLOCK,
1169    ETC1_R8G8B8_UNORM_BLOCK,
1170    ETC2_R8G8B8A1_SRGB_BLOCK,
1171    ETC2_R8G8B8A1_UNORM_BLOCK,
1172    ETC2_R8G8B8A8_SRGB_BLOCK,
1173    ETC2_R8G8B8A8_UNORM_BLOCK,
1174    ETC2_R8G8B8_SRGB_BLOCK,
1175    ETC2_R8G8B8_UNORM_BLOCK,
1176    G8_B8R8_2PLANE_420_UNORM,
1177    G8_B8_R8_3PLANE_420_UNORM,
1178    L16A16_FLOAT,
1179    L16_FLOAT,
1180    L32A32_FLOAT,
1181    L32_FLOAT,
1182    L4A4_UNORM,
1183    L8A8_UNORM,
1184    L8_UNORM,
1185    PALETTE4_R4G4B4A4_UNORM,
1186    PALETTE4_R5G5B5A1_UNORM,
1187    PALETTE4_R5G6B5_UNORM,
1188    PALETTE4_R8G8B8A8_UNORM,
1189    PALETTE4_R8G8B8_UNORM,
1190    PALETTE8_R4G4B4A4_UNORM,
1191    PALETTE8_R5G5B5A1_UNORM,
1192    PALETTE8_R5G6B5_UNORM,
1193    PALETTE8_R8G8B8A8_UNORM,
1194    PALETTE8_R8G8B8_UNORM,
1195    PVRTC1_RGBA_2BPP_UNORM_BLOCK,
1196    PVRTC1_RGBA_2BPP_UNORM_SRGB_BLOCK,
1197    PVRTC1_RGBA_4BPP_UNORM_BLOCK,
1198    PVRTC1_RGBA_4BPP_UNORM_SRGB_BLOCK,
1199    PVRTC1_RGB_2BPP_UNORM_BLOCK,
1200    PVRTC1_RGB_2BPP_UNORM_SRGB_BLOCK,
1201    PVRTC1_RGB_4BPP_UNORM_BLOCK,
1202    PVRTC1_RGB_4BPP_UNORM_SRGB_BLOCK,
1203    R10G10B10A2_SINT,
1204    R10G10B10A2_SNORM,
1205    R10G10B10A2_SSCALED,
1206    R10G10B10A2_UINT,
1207    R10G10B10A2_UNORM,
1208    R10G10B10A2_USCALED,
1209    R10G10B10X2_UNORM,
1210    R11G11B10_FLOAT,
1211    R16G16B16A16_FLOAT,
1212    R16G16B16A16_SINT,
1213    R16G16B16A16_SNORM,
1214    R16G16B16A16_SSCALED,
1215    R16G16B16A16_UINT,
1216    R16G16B16A16_UNORM,
1217    R16G16B16A16_USCALED,
1218    R16G16B16_FLOAT,
1219    R16G16B16_SINT,
1220    R16G16B16_SNORM,
1221    R16G16B16_SSCALED,
1222    R16G16B16_UINT,
1223    R16G16B16_UNORM,
1224    R16G16B16_USCALED,
1225    R16G16_FLOAT,
1226    R16G16_SINT,
1227    R16G16_SNORM,
1228    R16G16_SSCALED,
1229    R16G16_UINT,
1230    R16G16_UNORM,
1231    R16G16_USCALED,
1232    R16_FLOAT,
1233    R16_SINT,
1234    R16_SNORM,
1235    R16_SSCALED,
1236    R16_UINT,
1237    R16_UNORM,
1238    R16_USCALED,
1239    R32G32B32A32_FIXED,
1240    R32G32B32A32_FLOAT,
1241    R32G32B32A32_SINT,
1242    R32G32B32A32_SNORM,
1243    R32G32B32A32_SSCALED,
1244    R32G32B32A32_UINT,
1245    R32G32B32A32_UNORM,
1246    R32G32B32A32_USCALED,
1247    R32G32B32_FIXED,
1248    R32G32B32_FLOAT,
1249    R32G32B32_SINT,
1250    R32G32B32_SNORM,
1251    R32G32B32_SSCALED,
1252    R32G32B32_UINT,
1253    R32G32B32_UNORM,
1254    R32G32B32_USCALED,
1255    R32G32_FIXED,
1256    R32G32_FLOAT,
1257    R32G32_SINT,
1258    R32G32_SNORM,
1259    R32G32_SSCALED,
1260    R32G32_UINT,
1261    R32G32_UNORM,
1262    R32G32_USCALED,
1263    R32_FIXED,
1264    R32_FLOAT,
1265    R32_SINT,
1266    R32_SNORM,
1267    R32_SSCALED,
1268    R32_UINT,
1269    R32_UNORM,
1270    R32_USCALED,
1271    R4G4B4A4_UNORM,
1272    R5G5B5A1_UNORM,
1273    R5G6B5_UNORM,
1274    R8G8B8A8_SINT,
1275    R8G8B8A8_SNORM,
1276    R8G8B8A8_SSCALED,
1277    R8G8B8A8_TYPELESS,
1278    R8G8B8A8_TYPELESS_SRGB,
1279    R8G8B8A8_UINT,
1280    R8G8B8A8_UNORM,
1281    R8G8B8A8_UNORM_SRGB,
1282    R8G8B8A8_USCALED,
1283    R8G8B8X8_UNORM,
1284    R8G8B8X8_UNORM_SRGB,
1285    R8G8B8_SINT,
1286    R8G8B8_SNORM,
1287    R8G8B8_SSCALED,
1288    R8G8B8_UINT,
1289    R8G8B8_UNORM,
1290    R8G8B8_UNORM_SRGB,
1291    R8G8B8_USCALED,
1292    R8G8_SINT,
1293    R8G8_SNORM,
1294    R8G8_SSCALED,
1295    R8G8_UINT,
1296    R8G8_UNORM,
1297    R8G8_UNORM_SRGB,
1298    R8G8_USCALED,
1299    R8_SINT,
1300    R8_SNORM,
1301    R8_SSCALED,
1302    R8_UINT,
1303    R8_UNORM,
1304    R8_UNORM_SRGB,
1305    R8_USCALED,
1306    R9G9B9E5_SHAREDEXP,
1307    X2R10G10B10_SINT_VERTEX,
1308    X2R10G10B10_SNORM_VERTEX,
1309    X2R10G10B10_SSCALED_VERTEX,
1310    X2R10G10B10_UINT_VERTEX,
1311    X2R10G10B10_UNORM_VERTEX,
1312    X2R10G10B10_USCALED_VERTEX,
1313    EXTERNAL0,
1314    EXTERNAL1,
1315    EXTERNAL2,
1316    EXTERNAL3,
1317    EXTERNAL4,
1318    EXTERNAL5,
1319    EXTERNAL6,
1320    EXTERNAL7
1321};
1322}
1323}
1324}
1325
1326using namespace rx::mtl_shader;
1327constant int kCopyFormatType [[function_constant(10)]];
1328constant int kCopyTextureType [[function_constant(20)]];
1329constant bool kCopyTextureType2D = kCopyTextureType == kTextureType2D;
1330constant bool kCopyTextureType2DArray = kCopyTextureType == kTextureType2DArray;
1331constant bool kCopyTextureType2DMS = kCopyTextureType == kTextureType2DMultisample;
1332constant bool kCopyTextureTypeCube = kCopyTextureType == kTextureTypeCube;
1333constant bool kCopyTextureType3D = kCopyTextureType == kTextureType3D;
1334struct CopyPixelParams
1335{
1336    uint3 copySize;
1337    uint3 textureOffset;
1338    uint bufferStartOffset;
1339    uint pixelSize;
1340    uint bufferRowPitch;
1341    uint bufferDepthPitch;
1342};
1343struct WritePixelParams
1344{
1345    uint2 copySize;
1346    uint2 textureOffset;
1347    uint bufferStartOffset;
1348    uint pixelSize;
1349    uint bufferRowPitch;
1350    uint textureLevel;
1351    uint textureLayer;
1352    bool reverseTextureRowOrder;
1353};
1354template <typename T>
1355static inline void textureWrite(ushort3 gIndices,
1356                                constant CopyPixelParams &options,
1357                                vec<T, 4> color,
1358                                texture2d<T, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<T, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<T, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<T, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
1359{
1360    uint3 writeIndices = options.textureOffset + uint3(gIndices);
1361    switch (kCopyTextureType)
1362    {
1363        case kTextureType2D:
1364            dstTexture2d.write(color, writeIndices.xy);
1365            break;
1366        case kTextureType2DArray:
1367            dstTexture2dArray.write(color, writeIndices.xy, writeIndices.z);
1368            break;
1369        case kTextureType3D:
1370            dstTexture3d.write(color, writeIndices);
1371            break;
1372        case kTextureTypeCube:
1373            dstTextureCube.write(color, writeIndices.xy, writeIndices.z);
1374            break;
1375    }
1376}
1377template <typename T>
1378static inline vec<T, 4> textureRead(ushort2 gIndices,
1379                                    constant WritePixelParams &options,
1380                                    texture2d<T, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<T, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<T, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<T, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<T, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]])
1381{
1382    vec<T, 4> color;
1383    uint2 coords = uint2(gIndices);
1384    if (options.reverseTextureRowOrder)
1385    {
1386        coords.y = options.copySize.y - 1 - gIndices.y;
1387    }
1388    coords += options.textureOffset;
1389    switch (kCopyTextureType)
1390    {
1391        case kTextureType2D:
1392            color = srcTexture2d.read(coords.xy, options.textureLevel);
1393            break;
1394        case kTextureType2DArray:
1395            color = srcTexture2dArray.read(coords.xy, options.textureLayer, options.textureLevel);
1396            break;
1397        case kTextureType2DMultisample:
1398            color = resolveTextureMS(srcTexture2dMS, coords.xy);
1399            break;
1400        case kTextureType3D:
1401            color = srcTexture3d.read(uint3(coords, options.textureLayer), options.textureLevel);
1402            break;
1403        case kTextureTypeCube:
1404            color = srcTextureCube.read(coords.xy, options.textureLayer, options.textureLevel);
1405            break;
1406    }
1407    return color;
1408}
1409static inline float4 readR5G6B5_UNORM(uint bufferOffset, constant uchar *buffer)
1410{
1411    float4 color;
1412    ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1413    color.r = normalizedToFloat<5>(getShiftedData<5, 11>(src));
1414    color.g = normalizedToFloat<6>(getShiftedData<6, 5>(src));
1415    color.b = normalizedToFloat<5>(getShiftedData<5, 0>(src));
1416    color.a = 1.0;
1417    return color;
1418}
1419static inline void writeR5G6B5_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1420{
1421    ushort dst = shiftData<5, 11>(floatToNormalized<5, ushort>(color.r)) |
1422                 shiftData<6, 5>(floatToNormalized<6, ushort>(color.g)) |
1423                 shiftData<5, 0>(floatToNormalized<5, ushort>(color.b));
1424    shortToBytes(dst, bufferOffset, buffer);
1425}
1426static inline float4 readR4G4B4A4_UNORM(uint bufferOffset, constant uchar *buffer)
1427{
1428    float4 color;
1429    ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1430    color.r = normalizedToFloat<4>(getShiftedData<4, 12>(src));
1431    color.g = normalizedToFloat<4>(getShiftedData<4, 8>(src));
1432    color.b = normalizedToFloat<4>(getShiftedData<4, 4>(src));
1433    color.a = normalizedToFloat<4>(getShiftedData<4, 0>(src));
1434    return color;
1435}
1436static inline void writeR4G4B4A4_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1437{
1438    ushort dst = shiftData<4, 12>(floatToNormalized<4, ushort>(color.r)) |
1439                 shiftData<4, 8>(floatToNormalized<4, ushort>(color.g)) |
1440                 shiftData<4, 4>(floatToNormalized<4, ushort>(color.b)) |
1441                 shiftData<4, 0>(floatToNormalized<4, ushort>(color.a));
1442    ;
1443    shortToBytes(dst, bufferOffset, buffer);
1444}
1445static inline float4 readR5G5B5A1_UNORM(uint bufferOffset, constant uchar *buffer)
1446{
1447    float4 color;
1448    ushort src = bytesToShort<ushort>(buffer, bufferOffset);
1449    color.r = normalizedToFloat<5>(getShiftedData<5, 11>(src));
1450    color.g = normalizedToFloat<5>(getShiftedData<5, 6>(src));
1451    color.b = normalizedToFloat<5>(getShiftedData<5, 1>(src));
1452    color.a = normalizedToFloat<1>(getShiftedData<1, 0>(src));
1453    return color;
1454}
1455static inline void writeR5G5B5A1_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1456{
1457    ushort dst = shiftData<5, 11>(floatToNormalized<5, ushort>(color.r)) |
1458                 shiftData<5, 6>(floatToNormalized<5, ushort>(color.g)) |
1459                 shiftData<5, 1>(floatToNormalized<5, ushort>(color.b)) |
1460                 shiftData<1, 0>(floatToNormalized<1, ushort>(color.a));
1461    ;
1462    shortToBytes(dst, bufferOffset, buffer);
1463}
1464static inline int4 readR10G10B10A2_SINT(uint bufferOffset, constant uchar *buffer)
1465{
1466    int4 color;
1467    int src = bytesToInt<int>(buffer, bufferOffset);
1468    constexpr int3 rgbSignMask(0x200);
1469    constexpr int3 negativeMask(0xfffffc00);
1470    constexpr int alphaSignMask = 0x2;
1471    constexpr int alphaNegMask = 0xfffffffc;
1472    color.r = getShiftedData<10, 0>(src);
1473    color.g = getShiftedData<10, 10>(src);
1474    color.b = getShiftedData<10, 20>(src);
1475    int3 isRgbNegative = (color.rgb & rgbSignMask) >> 9;
1476    color.rgb = (isRgbNegative * negativeMask) | color.rgb;
1477    color.a = getShiftedData<2, 30>(src);
1478    int isAlphaNegative = color.a & alphaSignMask >> 1;
1479    color.a = (isAlphaNegative * alphaNegMask) | color.a;
1480    return color;
1481}
1482static inline uint4 readR10G10B10A2_UINT(uint bufferOffset, constant uchar *buffer)
1483{
1484    uint4 color;
1485    uint src = bytesToInt<uint>(buffer, bufferOffset);
1486    color.r = getShiftedData<10, 0>(src);
1487    color.g = getShiftedData<10, 10>(src);
1488    color.b = getShiftedData<10, 20>(src);
1489    color.a = getShiftedData<2, 30>(src);
1490    return color;
1491}
1492static inline float4 readR8G8B8A8(uint bufferOffset, constant uchar *buffer, bool isSRGB)
1493{
1494    float4 color;
1495    uint src = bytesToInt<uint>(buffer, bufferOffset);
1496    if (isSRGB)
1497    {
1498        color = unpack_unorm4x8_srgb_to_float(src);
1499    }
1500    else
1501    {
1502        color = unpack_unorm4x8_to_float(src);
1503    }
1504    return color;
1505}
1506static inline void writeR8G8B8A8(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer, bool isSRGB)
1507{
1508    uint dst;
1509    if (isSRGB)
1510    {
1511        dst = pack_float_to_srgb_unorm4x8(color);
1512    }
1513    else
1514    {
1515        dst = pack_float_to_unorm4x8(color);
1516    }
1517    intToBytes(dst, bufferOffset, buffer);
1518}
1519static inline float4 readR8G8B8(uint bufferOffset, constant uchar *buffer, bool isSRGB)
1520{
1521    float4 color;
1522    color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1523    color.g = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1524    color.b = normalizedToFloat<uchar>(buffer[bufferOffset + 2]);
1525    color.a = 1.0;
1526    if (isSRGB)
1527    {
1528        color = sRGBtoLinear(color);
1529    }
1530    return color;
1531}
1532static inline void writeR8G8B8(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer, bool isSRGB)
1533{
1534    color.a = 1.0;
1535    uint dst;
1536    if (isSRGB)
1537    {
1538        dst = pack_float_to_srgb_unorm4x8(color);
1539    }
1540    else
1541    {
1542        dst = pack_float_to_unorm4x8(color);
1543    }
1544    int24bitToBytes(dst, bufferOffset, buffer);
1545}
1546static inline float4 readR8G8B8A8_SNORM(uint bufferOffset, constant uchar *buffer)
1547{
1548    float4 color;
1549    uint src = bytesToInt<uint>(buffer, bufferOffset);
1550    color = unpack_snorm4x8_to_float(src);
1551    return color;
1552}
1553static inline void writeR8G8B8A8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1554{
1555    uint dst = pack_float_to_snorm4x8(color);
1556    intToBytes(dst, bufferOffset, buffer);
1557}
1558static inline float4 readR8G8B8_SNORM(uint bufferOffset, constant uchar *buffer)
1559{
1560    float4 color;
1561    color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1562    color.g = normalizedToFloat<7, char>(buffer[bufferOffset + 1]);
1563    color.b = normalizedToFloat<7, char>(buffer[bufferOffset + 2]);
1564    color.a = 1.0;
1565    return color;
1566}
1567static inline void writeR8G8B8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1568{
1569    uint dst = pack_float_to_snorm4x8(color);
1570    int24bitToBytes(dst, bufferOffset, buffer);
1571}
1572static inline float4 readR8G8B8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1573{
1574    return readR8G8B8A8(bufferOffset, buffer, false);
1575}
1576static inline void writeR8G8B8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1577{
1578    return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, false);
1579}
1580static inline float4 readR8G8B8A8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1581{
1582    return readR8G8B8A8(bufferOffset, buffer, true);
1583}
1584static inline void writeR8G8B8A8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1585{
1586    return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, true);
1587}
1588static inline float4 readB8G8R8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1589{
1590    return readR8G8B8A8(bufferOffset, buffer, false).bgra;
1591}
1592static inline void writeB8G8R8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1593{
1594    color.rgba = color.bgra;
1595    return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, false);
1596}
1597static inline float4 readB8G8R8A8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1598{
1599    return readR8G8B8A8(bufferOffset, buffer, true).bgra;
1600}
1601static inline void writeB8G8R8A8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1602{
1603    color.rgba = color.bgra;
1604    return writeR8G8B8A8(gIndices, options, bufferOffset, color, buffer, true);
1605}
1606static inline float4 readR8G8B8_UNORM(uint bufferOffset, constant uchar *buffer)
1607{
1608    return readR8G8B8(bufferOffset, buffer, false);
1609}
1610static inline void writeR8G8B8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1611{
1612    return writeR8G8B8(gIndices, options, bufferOffset, color, buffer, false);
1613}
1614static inline float4 readR8G8B8_UNORM_SRGB(uint bufferOffset, constant uchar *buffer)
1615{
1616    return readR8G8B8(bufferOffset, buffer, true);
1617}
1618static inline void writeR8G8B8_UNORM_SRGB(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1619{
1620    return writeR8G8B8(gIndices, options, bufferOffset, color, buffer, true);
1621}
1622static inline float4 readL8_UNORM(uint bufferOffset, constant uchar *buffer)
1623{
1624    float4 color;
1625    color.rgb = float3(normalizedToFloat<uchar>(buffer[bufferOffset]));
1626    color.a = 1.0;
1627    return color;
1628}
1629static inline void writeL8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1630{
1631    buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1632}
1633static inline void writeA8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1634{
1635    buffer[bufferOffset] = floatToNormalized<uchar>(color.a);
1636}
1637static inline float4 readL8A8_UNORM(uint bufferOffset, constant uchar *buffer)
1638{
1639    float4 color;
1640    color.rgb = float3(normalizedToFloat<uchar>(buffer[bufferOffset]));
1641    color.a = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1642    return color;
1643}
1644static inline void writeL8A8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1645{
1646    buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1647    buffer[bufferOffset + 1] = floatToNormalized<uchar>(color.a);
1648}
1649static inline float4 readR8_UNORM(uint bufferOffset, constant uchar *buffer)
1650{
1651    float4 color;
1652    color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1653    color.g = color.b = 0.0;
1654    color.a = 1.0;
1655    return color;
1656}
1657static inline void writeR8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1658{
1659    buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1660}
1661static inline float4 readR8_SNORM(uint bufferOffset, constant uchar *buffer)
1662{
1663    float4 color;
1664    color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1665    color.g = color.b = 0.0;
1666    color.a = 1.0;
1667    return color;
1668}
1669static inline void writeR8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1670{
1671    buffer[bufferOffset] = as_type<uchar>(floatToNormalized<char>(color.r));
1672}
1673static inline int4 readR8_SINT(uint bufferOffset, constant uchar *buffer)
1674{
1675    int4 color;
1676    color.r = as_type<char>(buffer[bufferOffset]);
1677    color.g = color.b = 0;
1678    color.a = 1;
1679    return color;
1680}
1681static inline void writeR8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1682{
1683    buffer[bufferOffset] = static_cast<uchar>(color.r);
1684}
1685static inline uint4 readR8_UINT(uint bufferOffset, constant uchar *buffer)
1686{
1687    uint4 color;
1688    color.r = as_type<uchar>(buffer[bufferOffset]);
1689    color.g = color.b = 0;
1690    color.a = 1;
1691    return color;
1692}
1693static inline void writeR8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1694{
1695    buffer[bufferOffset] = static_cast<uchar>(color.r);
1696}
1697static inline float4 readR8G8_UNORM(uint bufferOffset, constant uchar *buffer)
1698{
1699    float4 color;
1700    color.r = normalizedToFloat<uchar>(buffer[bufferOffset]);
1701    color.g = normalizedToFloat<uchar>(buffer[bufferOffset + 1]);
1702    color.b = 0.0;
1703    color.a = 1.0;
1704    return color;
1705}
1706static inline void writeR8G8_UNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1707{
1708    buffer[bufferOffset] = floatToNormalized<uchar>(color.r);
1709    buffer[bufferOffset + 1] = floatToNormalized<uchar>(color.g);
1710}
1711static inline float4 readR8G8_SNORM(uint bufferOffset, constant uchar *buffer)
1712{
1713    float4 color;
1714    color.r = normalizedToFloat<7, char>(buffer[bufferOffset]);
1715    color.g = normalizedToFloat<7, char>(buffer[bufferOffset + 1]);
1716    color.b = 0.0;
1717    color.a = 1.0;
1718    return color;
1719}
1720static inline void writeR8G8_SNORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1721{
1722    buffer[bufferOffset] = as_type<uchar>(floatToNormalized<char>(color.r));
1723    buffer[bufferOffset + 1] = as_type<uchar>(floatToNormalized<char>(color.g));
1724}
1725static inline int4 readR8G8_SINT(uint bufferOffset, constant uchar *buffer)
1726{
1727    int4 color;
1728    color.r = as_type<char>(buffer[bufferOffset]);
1729    color.g = as_type<char>(buffer[bufferOffset + 1]);
1730    color.b = 0;
1731    color.a = 1;
1732    return color;
1733}
1734static inline void writeR8G8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1735{
1736    buffer[bufferOffset] = static_cast<uchar>(color.r);
1737    buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1738}
1739static inline uint4 readR8G8_UINT(uint bufferOffset, constant uchar *buffer)
1740{
1741    uint4 color;
1742    color.r = as_type<uchar>(buffer[bufferOffset]);
1743    color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1744    color.b = 0;
1745    color.a = 1;
1746    return color;
1747}
1748static inline void writeR8G8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1749{
1750    buffer[bufferOffset] = static_cast<uchar>(color.r);
1751    buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1752}
1753static inline int4 readR8G8B8_SINT(uint bufferOffset, constant uchar *buffer)
1754{
1755    int4 color;
1756    color.r = as_type<char>(buffer[bufferOffset]);
1757    color.g = as_type<char>(buffer[bufferOffset + 1]);
1758    color.b = as_type<char>(buffer[bufferOffset + 2]);
1759    color.a = 1;
1760    return color;
1761}
1762static inline uint4 readR8G8B8_UINT(uint bufferOffset, constant uchar *buffer)
1763{
1764    uint4 color;
1765    color.r = as_type<uchar>(buffer[bufferOffset]);
1766    color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1767    color.b = as_type<uchar>(buffer[bufferOffset + 2]);
1768    color.a = 1;
1769    return color;
1770}
1771static inline int4 readR8G8B8A8_SINT(uint bufferOffset, constant uchar *buffer)
1772{
1773    int4 color;
1774    color.r = as_type<char>(buffer[bufferOffset]);
1775    color.g = as_type<char>(buffer[bufferOffset + 1]);
1776    color.b = as_type<char>(buffer[bufferOffset + 2]);
1777    color.a = as_type<char>(buffer[bufferOffset + 3]);
1778    return color;
1779}
1780static inline void writeR8G8B8A8_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1781{
1782    buffer[bufferOffset] = static_cast<uchar>(color.r);
1783    buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1784    buffer[bufferOffset + 2] = static_cast<uchar>(color.b);
1785    buffer[bufferOffset + 3] = static_cast<uchar>(color.a);
1786}
1787static inline uint4 readR8G8B8A8_UINT(uint bufferOffset, constant uchar *buffer)
1788{
1789    uint4 color;
1790    color.r = as_type<uchar>(buffer[bufferOffset]);
1791    color.g = as_type<uchar>(buffer[bufferOffset + 1]);
1792    color.b = as_type<uchar>(buffer[bufferOffset + 2]);
1793    color.a = as_type<uchar>(buffer[bufferOffset + 3]);
1794    return color;
1795}
1796static inline void writeR8G8B8A8_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1797{
1798    buffer[bufferOffset] = static_cast<uchar>(color.r);
1799    buffer[bufferOffset + 1] = static_cast<uchar>(color.g);
1800    buffer[bufferOffset + 2] = static_cast<uchar>(color.b);
1801    buffer[bufferOffset + 3] = static_cast<uchar>(color.a);
1802}
1803static inline float4 readR16_FLOAT(uint bufferOffset, constant uchar *buffer)
1804{
1805    float4 color;
1806    color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1807    color.g = color.b = 0.0;
1808    color.a = 1.0;
1809    return color;
1810}
1811static inline void writeR16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1812{
1813    shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1814}
1815template <typename ShortType>
1816static inline float4 readR16_NORM(uint bufferOffset, constant uchar *buffer)
1817{
1818    float4 color;
1819    color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1820    color.g = color.b = 0.0;
1821    color.a = 1.0;
1822    return color;
1823}
1824template<typename ShortType>
1825static inline void writeR16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1826{
1827    shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
1828}
1829static inline int4 readR16_SINT(uint bufferOffset, constant uchar *buffer)
1830{
1831    int4 color;
1832    color.r = bytesToShort<short>(buffer, bufferOffset);
1833    color.g = color.b = 0;
1834    color.a = 1;
1835    return color;
1836}
1837static inline void writeR16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1838{
1839    shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
1840}
1841static inline uint4 readR16_UINT(uint bufferOffset, constant uchar *buffer)
1842{
1843    uint4 color;
1844    color.r = bytesToShort<ushort>(buffer, bufferOffset);
1845    color.g = color.b = 0;
1846    color.a = 1;
1847    return color;
1848}
1849static inline void writeR16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1850{
1851    shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
1852}
1853static inline float4 readA16_FLOAT(uint bufferOffset, constant uchar *buffer)
1854{
1855    float4 color;
1856    color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1857    color.rgb = 0.0;
1858    return color;
1859}
1860static inline void writeA16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1861{
1862    shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset, buffer);
1863}
1864static inline float4 readL16_FLOAT(uint bufferOffset, constant uchar *buffer)
1865{
1866    float4 color;
1867    color.rgb = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1868    color.a = 1.0;
1869    return color;
1870}
1871static inline void writeL16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1872{
1873    shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1874}
1875static inline float4 readL16A16_FLOAT(uint bufferOffset, constant uchar *buffer)
1876{
1877    float4 color;
1878    color.rgb = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1879    color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1880    return color;
1881}
1882static inline void writeL16A16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1883{
1884    shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1885    shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset + 2, buffer);
1886}
1887static inline float4 readR16G16_FLOAT(uint bufferOffset, constant uchar *buffer)
1888{
1889    float4 color;
1890    color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1891    color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1892    color.b = 0.0;
1893    color.a = 1.0;
1894    return color;
1895}
1896static inline void writeR16G16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1897{
1898    shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1899    shortToBytes(as_type<ushort>(static_cast<half>(color.g)), bufferOffset + 2, buffer);
1900}
1901template <typename ShortType>
1902static inline float4 readR16G16_NORM(uint bufferOffset, constant uchar *buffer)
1903{
1904    float4 color;
1905    color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1906    color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
1907    color.b = 0.0;
1908    color.a = 1.0;
1909    return color;
1910}
1911template<typename ShortType>
1912static inline void writeR16G16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1913{
1914    shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
1915    shortToBytes(floatToNormalized<ShortType>(color.g), bufferOffset + 2, buffer);
1916}
1917static inline int4 readR16G16_SINT(uint bufferOffset, constant uchar *buffer)
1918{
1919    int4 color;
1920    color.r = bytesToShort<short>(buffer, bufferOffset);
1921    color.g = bytesToShort<short>(buffer, bufferOffset + 2);
1922    color.b = 0;
1923    color.a = 1;
1924    return color;
1925}
1926static inline void writeR16G16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
1927{
1928    shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
1929    shortToBytes(static_cast<short>(color.g), bufferOffset + 2, buffer);
1930}
1931static inline uint4 readR16G16_UINT(uint bufferOffset, constant uchar *buffer)
1932{
1933    uint4 color;
1934    color.r = bytesToShort<ushort>(buffer, bufferOffset);
1935    color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
1936    color.b = 0;
1937    color.a = 1;
1938    return color;
1939}
1940static inline void writeR16G16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
1941{
1942    shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
1943    shortToBytes(static_cast<ushort>(color.g), bufferOffset + 2, buffer);
1944}
1945static inline float4 readR16G16B16_FLOAT(uint bufferOffset, constant uchar *buffer)
1946{
1947    float4 color;
1948    color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1949    color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1950    color.b = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 4));
1951    color.a = 1.0;
1952    return color;
1953}
1954template <typename ShortType>
1955static inline float4 readR16G16B16_NORM(uint bufferOffset, constant uchar *buffer)
1956{
1957    float4 color;
1958    color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
1959    color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
1960    color.b = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 4));
1961    color.a = 1.0;
1962    return color;
1963}
1964static inline int4 readR16G16B16_SINT(uint bufferOffset, constant uchar *buffer)
1965{
1966    int4 color;
1967    color.r = bytesToShort<short>(buffer, bufferOffset);
1968    color.g = bytesToShort<short>(buffer, bufferOffset + 2);
1969    color.b = bytesToShort<short>(buffer, bufferOffset + 4);
1970    color.a = 1;
1971    return color;
1972}
1973static inline uint4 readR16G16B16_UINT(uint bufferOffset, constant uchar *buffer)
1974{
1975    uint4 color;
1976    color.r = bytesToShort<ushort>(buffer, bufferOffset);
1977    color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
1978    color.b = bytesToShort<ushort>(buffer, bufferOffset + 4);
1979    color.a = 1;
1980    return color;
1981}
1982static inline float4 readR16G16B16A16_FLOAT(uint bufferOffset, constant uchar *buffer)
1983{
1984    float4 color;
1985    color.r = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset));
1986    color.g = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 2));
1987    color.b = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 4));
1988    color.a = as_type<half>(bytesToShort<ushort>(buffer, bufferOffset + 6));
1989    return color;
1990}
1991static inline void writeR16G16B16A16_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
1992{
1993    shortToBytes(as_type<ushort>(static_cast<half>(color.r)), bufferOffset, buffer);
1994    shortToBytes(as_type<ushort>(static_cast<half>(color.g)), bufferOffset + 2, buffer);
1995    shortToBytes(as_type<ushort>(static_cast<half>(color.b)), bufferOffset + 4, buffer);
1996    shortToBytes(as_type<ushort>(static_cast<half>(color.a)), bufferOffset + 6, buffer);
1997}
1998template <typename ShortType>
1999static inline float4 readR16G16B16A16_NORM(uint bufferOffset, constant uchar *buffer)
2000{
2001    float4 color;
2002    color.r = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset));
2003    color.g = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 2));
2004    color.b = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 4));
2005    color.a = normalizedToFloat<ShortType>(bytesToShort<ShortType>(buffer, bufferOffset + 6));
2006    return color;
2007}
2008template<typename ShortType>
2009static inline void writeR16G16B16A16_NORM(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2010{
2011    shortToBytes(floatToNormalized<ShortType>(color.r), bufferOffset, buffer);
2012    shortToBytes(floatToNormalized<ShortType>(color.g), bufferOffset + 2, buffer);
2013    shortToBytes(floatToNormalized<ShortType>(color.b), bufferOffset + 4, buffer);
2014    shortToBytes(floatToNormalized<ShortType>(color.a), bufferOffset + 6, buffer);
2015}
2016static inline int4 readR16G16B16A16_SINT(uint bufferOffset, constant uchar *buffer)
2017{
2018    int4 color;
2019    color.r = bytesToShort<short>(buffer, bufferOffset);
2020    color.g = bytesToShort<short>(buffer, bufferOffset + 2);
2021    color.b = bytesToShort<short>(buffer, bufferOffset + 4);
2022    color.a = bytesToShort<short>(buffer, bufferOffset + 6);
2023    return color;
2024}
2025static inline void writeR16G16B16A16_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2026{
2027    shortToBytes(static_cast<short>(color.r), bufferOffset, buffer);
2028    shortToBytes(static_cast<short>(color.g), bufferOffset + 2, buffer);
2029    shortToBytes(static_cast<short>(color.b), bufferOffset + 4, buffer);
2030    shortToBytes(static_cast<short>(color.a), bufferOffset + 6, buffer);
2031}
2032static inline uint4 readR16G16B16A16_UINT(uint bufferOffset, constant uchar *buffer)
2033{
2034    uint4 color;
2035    color.r = bytesToShort<ushort>(buffer, bufferOffset);
2036    color.g = bytesToShort<ushort>(buffer, bufferOffset + 2);
2037    color.b = bytesToShort<ushort>(buffer, bufferOffset + 4);
2038    color.a = bytesToShort<ushort>(buffer, bufferOffset + 6);
2039    return color;
2040}
2041static inline void writeR16G16B16A16_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2042{
2043    shortToBytes(static_cast<ushort>(color.r), bufferOffset, buffer);
2044    shortToBytes(static_cast<ushort>(color.g), bufferOffset + 2, buffer);
2045    shortToBytes(static_cast<ushort>(color.b), bufferOffset + 4, buffer);
2046    shortToBytes(static_cast<ushort>(color.a), bufferOffset + 6, buffer);
2047}
2048static inline float4 readR32_FLOAT(uint bufferOffset, constant uchar *buffer)
2049{
2050    float4 color;
2051    color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2052    color.g = color.b = 0.0;
2053    color.a = 1.0;
2054    return color;
2055}
2056static inline void writeR32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2057{
2058    intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2059}
2060template <typename IntType>
2061static inline float4 readR32_NORM(uint bufferOffset, constant uchar *buffer)
2062{
2063    float4 color;
2064    color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2065    color.g = color.b = 0.0;
2066    color.a = 1.0;
2067    return color;
2068}
2069static inline float4 readA32_FLOAT(uint bufferOffset, constant uchar *buffer)
2070{
2071    float4 color;
2072    color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2073    color.rgb = 0.0;
2074    return color;
2075}
2076static inline void writeA32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2077{
2078    intToBytes(as_type<uint>(color.a), bufferOffset, buffer);
2079}
2080static inline float4 readL32_FLOAT(uint bufferOffset, constant uchar *buffer)
2081{
2082    float4 color;
2083    color.rgb = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2084    color.a = 1.0;
2085    return color;
2086}
2087static inline void writeL32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2088{
2089    intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2090}
2091static inline int4 readR32_SINT(uint bufferOffset, constant uchar *buffer)
2092{
2093    int4 color;
2094    color.r = bytesToInt<int>(buffer, bufferOffset);
2095    color.g = color.b = 0;
2096    color.a = 1;
2097    return color;
2098}
2099static inline void writeR32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2100{
2101    intToBytes(color.r, bufferOffset, buffer);
2102}
2103static inline float4 readR32_FIXED(uint bufferOffset, constant uchar *buffer)
2104{
2105    float4 color;
2106    constexpr float kDivisor = 1.0f / (1 << 16);
2107    color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2108    color.g = color.b = 0.0;
2109    color.a = 1.0;
2110    return color;
2111}
2112static inline uint4 readR32_UINT(uint bufferOffset, constant uchar *buffer)
2113{
2114    uint4 color;
2115    color.r = bytesToInt<uint>(buffer, bufferOffset);
2116    color.g = color.b = 0;
2117    color.a = 1;
2118    return color;
2119}
2120static inline void writeR32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2121{
2122    intToBytes(color.r, bufferOffset, buffer);
2123}
2124static inline float4 readL32A32_FLOAT(uint bufferOffset, constant uchar *buffer)
2125{
2126    float4 color;
2127    color.rgb = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2128    color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2129    return color;
2130}
2131static inline void writeL32A32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2132{
2133    intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2134    intToBytes(as_type<uint>(color.a), bufferOffset + 4, buffer);
2135}
2136static inline float4 readR32G32_FLOAT(uint bufferOffset, constant uchar *buffer)
2137{
2138    float4 color;
2139    color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2140    color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2141    color.b = 0.0;
2142    color.a = 1.0;
2143    return color;
2144}
2145static inline void writeR32G32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2146{
2147    intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2148    intToBytes(as_type<uint>(color.g), bufferOffset + 4, buffer);
2149}
2150template <typename IntType>
2151static inline float4 readR32G32_NORM(uint bufferOffset, constant uchar *buffer)
2152{
2153    float4 color;
2154    color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2155    color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2156    color.b = 0.0;
2157    color.a = 1.0;
2158    return color;
2159}
2160static inline int4 readR32G32_SINT(uint bufferOffset, constant uchar *buffer)
2161{
2162    int4 color;
2163    color.r = bytesToInt<int>(buffer, bufferOffset);
2164    color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2165    color.b = 0;
2166    color.a = 1;
2167    return color;
2168}
2169static inline void writeR32G32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2170{
2171    intToBytes(color.r, bufferOffset, buffer);
2172    intToBytes(color.g, bufferOffset + 4, buffer);
2173}
2174static inline float4 readR32G32_FIXED(uint bufferOffset, constant uchar *buffer)
2175{
2176    float4 color;
2177    constexpr float kDivisor = 1.0f / (1 << 16);
2178    color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2179    color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2180    color.b = 0.0;
2181    color.a = 1.0;
2182    return color;
2183}
2184static inline uint4 readR32G32_UINT(uint bufferOffset, constant uchar *buffer)
2185{
2186    uint4 color;
2187    color.r = bytesToInt<uint>(buffer, bufferOffset);
2188    color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2189    color.b = 0;
2190    color.a = 1;
2191    return color;
2192}
2193static inline void writeR32G32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2194{
2195    intToBytes(color.r, bufferOffset, buffer);
2196    intToBytes(color.g, bufferOffset + 4, buffer);
2197}
2198static inline float4 readR32G32B32_FLOAT(uint bufferOffset, constant uchar *buffer)
2199{
2200    float4 color;
2201    color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2202    color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2203    color.b = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 8));
2204    color.a = 1.0;
2205    return color;
2206}
2207template <typename IntType>
2208static inline float4 readR32G32B32_NORM(uint bufferOffset, constant uchar *buffer)
2209{
2210    float4 color;
2211    color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2212    color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2213    color.b = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 8));
2214    color.a = 1.0;
2215    return color;
2216}
2217static inline int4 readR32G32B32_SINT(uint bufferOffset, constant uchar *buffer)
2218{
2219    int4 color;
2220    color.r = bytesToInt<int>(buffer, bufferOffset);
2221    color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2222    color.b = bytesToInt<int>(buffer, bufferOffset + 8);
2223    color.a = 1;
2224    return color;
2225}
2226static inline float4 readR32G32B32_FIXED(uint bufferOffset, constant uchar *buffer)
2227{
2228    float4 color;
2229    constexpr float kDivisor = 1.0f / (1 << 16);
2230    color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2231    color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2232    color.b = bytesToInt<int>(buffer, bufferOffset + 8) * kDivisor;
2233    color.a = 1.0;
2234    return color;
2235}
2236static inline uint4 readR32G32B32_UINT(uint bufferOffset, constant uchar *buffer)
2237{
2238    uint4 color;
2239    color.r = bytesToInt<uint>(buffer, bufferOffset);
2240    color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2241    color.b = bytesToInt<uint>(buffer, bufferOffset + 8);
2242    color.a = 1;
2243    return color;
2244}
2245static inline float4 readR32G32B32A32_FLOAT(uint bufferOffset, constant uchar *buffer)
2246{
2247    float4 color;
2248    color.r = as_type<float>(bytesToInt<uint>(buffer, bufferOffset));
2249    color.g = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 4));
2250    color.b = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 8));
2251    color.a = as_type<float>(bytesToInt<uint>(buffer, bufferOffset + 12));
2252    return color;
2253}
2254static inline void writeR32G32B32A32_FLOAT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<float, 4> color, device uchar *buffer)
2255{
2256    intToBytes(as_type<uint>(color.r), bufferOffset, buffer);
2257    intToBytes(as_type<uint>(color.g), bufferOffset + 4, buffer);
2258    intToBytes(as_type<uint>(color.b), bufferOffset + 8, buffer);
2259    intToBytes(as_type<uint>(color.a), bufferOffset + 12, buffer);
2260}
2261template <typename IntType>
2262static inline float4 readR32G32B32A32_NORM(uint bufferOffset, constant uchar *buffer)
2263{
2264    float4 color;
2265    color.r = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset));
2266    color.g = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 4));
2267    color.b = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 8));
2268    color.a = normalizedToFloat<IntType>(bytesToInt<IntType>(buffer, bufferOffset + 12));
2269    return color;
2270}
2271static inline int4 readR32G32B32A32_SINT(uint bufferOffset, constant uchar *buffer)
2272{
2273    int4 color;
2274    color.r = bytesToInt<int>(buffer, bufferOffset);
2275    color.g = bytesToInt<int>(buffer, bufferOffset + 4);
2276    color.b = bytesToInt<int>(buffer, bufferOffset + 8);
2277    color.a = bytesToInt<int>(buffer, bufferOffset + 12);
2278    return color;
2279}
2280static inline void writeR32G32B32A32_SINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<int, 4> color, device uchar *buffer)
2281{
2282    intToBytes(color.r, bufferOffset, buffer);
2283    intToBytes(color.g, bufferOffset + 4, buffer);
2284    intToBytes(color.b, bufferOffset + 8, buffer);
2285    intToBytes(color.a, bufferOffset + 12, buffer);
2286}
2287static inline float4 readR32G32B32A32_FIXED(uint bufferOffset, constant uchar *buffer)
2288{
2289    float4 color;
2290    constexpr float kDivisor = 1.0f / (1 << 16);
2291    color.r = bytesToInt<int>(buffer, bufferOffset) * kDivisor;
2292    color.g = bytesToInt<int>(buffer, bufferOffset + 4) * kDivisor;
2293    color.b = bytesToInt<int>(buffer, bufferOffset + 8) * kDivisor;
2294    color.a = bytesToInt<int>(buffer, bufferOffset + 12) * kDivisor;
2295    return color;
2296}
2297static inline uint4 readR32G32B32A32_UINT(uint bufferOffset, constant uchar *buffer)
2298{
2299    uint4 color;
2300    color.r = bytesToInt<uint>(buffer, bufferOffset);
2301    color.g = bytesToInt<uint>(buffer, bufferOffset + 4);
2302    color.b = bytesToInt<uint>(buffer, bufferOffset + 8);
2303    color.a = bytesToInt<uint>(buffer, bufferOffset + 12);
2304    return color;
2305}
2306static inline void writeR32G32B32A32_UINT(ushort2 gIndices, constant WritePixelParams &options, uint bufferOffset, vec<uint, 4> color, device uchar *buffer)
2307{
2308    intToBytes(color.r, bufferOffset, buffer);
2309    intToBytes(color.g, bufferOffset + 4, buffer);
2310    intToBytes(color.b, bufferOffset + 8, buffer);
2311    intToBytes(color.a, bufferOffset + 12, buffer);
2312}
2313static inline int4 readR8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8_SINT(bufferOffset, buffer); } static inline uint4 readR8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8_UINT(bufferOffset, buffer); } static inline int4 readR8G8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8_UINT(bufferOffset, buffer); } static inline int4 readR8G8B8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8B8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8_UINT(bufferOffset, buffer); } static inline int4 readR8G8B8A8_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8A8_SINT(bufferOffset, buffer); } static inline uint4 readR8G8B8A8_USCALED(uint bufferOffset, constant uchar *buffer) { return readR8G8B8A8_UINT(bufferOffset, buffer); }
2314static inline int4 readR16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16_SINT(bufferOffset, buffer); } static inline uint4 readR16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16_UINT(bufferOffset, buffer); } static inline int4 readR16G16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16_UINT(bufferOffset, buffer); } static inline int4 readR16G16B16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16B16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16_UINT(bufferOffset, buffer); } static inline int4 readR16G16B16A16_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16A16_SINT(bufferOffset, buffer); } static inline uint4 readR16G16B16A16_USCALED(uint bufferOffset, constant uchar *buffer) { return readR16G16B16A16_UINT(bufferOffset, buffer); }
2315static inline int4 readR32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32_SINT(bufferOffset, buffer); } static inline uint4 readR32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32_UINT(bufferOffset, buffer); } static inline int4 readR32G32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32_UINT(bufferOffset, buffer); } static inline int4 readR32G32B32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32B32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32_UINT(bufferOffset, buffer); } static inline int4 readR32G32B32A32_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32A32_SINT(bufferOffset, buffer); } static inline uint4 readR32G32B32A32_USCALED(uint bufferOffset, constant uchar *buffer) { return readR32G32B32A32_UINT(bufferOffset, buffer); }
2316static inline int4 readR10G10B10A2_SSCALED(uint bufferOffset, constant uchar *buffer) { return readR10G10B10A2_SINT(bufferOffset, buffer); } static inline uint4 readR10G10B10A2_USCALED(uint bufferOffset, constant uchar *buffer) { return readR10G10B10A2_UINT(bufferOffset, buffer); }
2317kernel void readFromBufferToFloatTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<float, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<float, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<float, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<float, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2318{
2319    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2320    uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2321    switch (kCopyFormatType)
2322    {
2323        case FormatID::R5G6B5_UNORM: { auto color = readR5G6B5_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UNORM: { auto color = readR8G8B8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UNORM_SRGB: { auto color = readR8G8B8A8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_SNORM: { auto color = readR8G8B8A8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::B8G8R8A8_UNORM: { auto color = readB8G8R8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::B8G8R8A8_UNORM_SRGB: { auto color = readB8G8R8A8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UNORM: { auto color = readR8G8B8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UNORM_SRGB: { auto color = readR8G8B8_UNORM_SRGB(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_SNORM: { auto color = readR8G8B8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L8_UNORM: { auto color = readL8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L8A8_UNORM: { auto color = readL8A8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R5G5B5A1_UNORM: { auto color = readR5G5B5A1_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R4G4B4A4_UNORM: { auto color = readR4G4B4A4_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8_UNORM: { auto color = readR8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8_SNORM: { auto color = readR8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_UNORM: { auto color = readR8G8_UNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_SNORM: { auto color = readR8G8_SNORM(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_FLOAT: { auto color = readR16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_SNORM: { auto color = readR16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_UNORM: { auto color = readR16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::A16_FLOAT: { auto color = readA16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L16_FLOAT: { auto color = readL16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L16A16_FLOAT: { auto color = readL16A16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_FLOAT: { auto color = readR16G16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_SNORM: { auto color = readR16G16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_UNORM: { auto color = readR16G16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_FLOAT: { auto color = readR16G16B16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_SNORM: { auto color = readR16G16B16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_UNORM: { auto color = readR16G16B16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_FLOAT: { auto color = readR16G16B16A16_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_SNORM: { auto color = readR16G16B16A16_NORM<short>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_UNORM: { auto color = readR16G16B16A16_NORM<ushort>(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_FLOAT: { auto color = readR32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::A32_FLOAT: { auto color = readA32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L32_FLOAT: { auto color = readL32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::L32A32_FLOAT: { auto color = readL32A32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_FLOAT: { auto color = readR32G32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_FLOAT: { auto color = readR32G32B32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_FLOAT: { auto color = readR32G32B32A32_FLOAT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2324    }
2325}
2326kernel void readFromBufferToIntTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<int, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<int, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<int, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<int, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2327{
2328    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2329    uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2330    switch (kCopyFormatType)
2331    {
2332        case FormatID::R8_SINT: { auto color = readR8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_SINT: { auto color = readR8G8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_SINT: { auto color = readR8G8B8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_SINT: { auto color = readR8G8B8A8_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_SINT: { auto color = readR16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_SINT: { auto color = readR16G16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_SINT: { auto color = readR16G16B16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_SINT: { auto color = readR16G16B16A16_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_SINT: { auto color = readR32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_SINT: { auto color = readR32G32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_SINT: { auto color = readR32G32B32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_SINT: { auto color = readR32G32B32A32_SINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2333    }
2334}
2335kernel void readFromBufferToUIntTexture(ushort3 gIndices [[thread_position_in_grid]], constant CopyPixelParams &options[[buffer(0)]], constant uchar *buffer [[buffer(1)]], texture2d<uint, access::write> dstTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<uint, access::write> dstTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<uint, access::write> dstTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<uint, access::write> dstTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]])
2336{
2337    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y || gIndices.z >= options.copySize.z) { return; }
2338    uint bufferOffset = options.bufferStartOffset + (gIndices.z * options.bufferDepthPitch + gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2339    switch (kCopyFormatType)
2340    {
2341        case FormatID::R8_UINT: { auto color = readR8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8_UINT: { auto color = readR8G8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8_UINT: { auto color = readR8G8B8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R8G8B8A8_UINT: { auto color = readR8G8B8A8_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16_UINT: { auto color = readR16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16_UINT: { auto color = readR16G16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16_UINT: { auto color = readR16G16B16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R16G16B16A16_UINT: { auto color = readR16G16B16A16_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32_UINT: { auto color = readR32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32_UINT: { auto color = readR32G32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32_UINT: { auto color = readR32G32B32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break; case FormatID::R32G32B32A32_UINT: { auto color = readR32G32B32A32_UINT(bufferOffset, buffer); textureWrite(gIndices, options, color, dstTexture2d, dstTexture2dArray, dstTexture3d, dstTextureCube); } break;
2342    }
2343}
2344kernel void writeFromFloatTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<float, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<float, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<float, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<float, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<float, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2345{
2346    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2347    uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2348    switch (kCopyFormatType)
2349    {
2350        case FormatID::R5G6B5_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR5G6B5_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::B8G8R8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeB8G8R8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::B8G8R8A8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeB8G8R8A8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_UNORM_SRGB: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_UNORM_SRGB(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L8A8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL8A8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R5G5B5A1_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR5G5B5A1_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R4G4B4A4_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR4G4B4A4_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_UNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_SNORM(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L16A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL16A16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_SNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_NORM<short>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_UNORM: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_NORM<ushort>(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeA32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::L32A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeL32A32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_FLOAT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_FLOAT(gIndices, options, bufferOffset, color, buffer); } break;
2351    }
2352}
2353kernel void writeFromIntTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<int, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<int, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<int, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<int, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<int, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2354{
2355    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2356    uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2357    switch (kCopyFormatType)
2358    {
2359        case FormatID::R8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_SINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_SINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_SINT(gIndices, options, bufferOffset, color, buffer); } break;
2360    }
2361}
2362kernel void writeFromUIntTextureToBuffer(ushort2 gIndices [[thread_position_in_grid]], constant WritePixelParams &options[[buffer(0)]], texture2d<uint, access::read> srcTexture2d [[texture(0), function_constant(kCopyTextureType2D)]], texture2d_array<uint, access::read> srcTexture2dArray [[texture(0), function_constant(kCopyTextureType2DArray)]], texture3d<uint, access::read> srcTexture3d [[texture(0), function_constant(kCopyTextureType3D)]], texturecube<uint, access::read> srcTextureCube [[texture(0), function_constant(kCopyTextureTypeCube)]], texture2d_ms<uint, access::read> srcTexture2dMS [[texture(0), function_constant(kCopyTextureType2DMS)]], device uchar *buffer [[buffer(1)]])
2363{
2364    if (gIndices.x >= options.copySize.x || gIndices.y >= options.copySize.y) { return; }
2365    uint bufferOffset = options.bufferStartOffset + (gIndices.y * options.bufferRowPitch + gIndices.x * options.pixelSize);
2366    switch (kCopyFormatType)
2367    {
2368        case FormatID::R8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R8G8B8A8_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR8G8B8A8_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R16G16B16A16_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR16G16B16A16_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32_UINT(gIndices, options, bufferOffset, color, buffer); } break; case FormatID::R32G32B32A32_UINT: { auto color = textureRead(gIndices, options, srcTexture2d, srcTexture2dArray, srcTexture3d, srcTextureCube, srcTexture2dMS); writeR32G32B32A32_UINT(gIndices, options, bufferOffset, color, buffer); } break;
2369    }
2370}
2371struct CopyVertexParams
2372{
2373    uint srcBufferStartOffset;
2374    uint srcStride;
2375    uint srcComponentBytes;
2376    uint srcComponents;
2377    uchar4 srcDefaultAlphaData;
2378    uint dstBufferStartOffset;
2379    uint dstStride;
2380    uint dstComponents;
2381    uint vertexCount;
2382};
2383template <typename IntType>
2384static inline void writeFloatVertex(constant CopyVertexParams &options,
2385                                    uint idx,
2386                                    vec<IntType, 4> data,
2387                                    device uchar *dst)
2388{
2389    uint dstOffset = idx * options.dstStride + options.dstBufferStartOffset;
2390    for (uint component = 0; component < options.dstComponents; ++component, dstOffset += 4)
2391    {
2392        floatToBytes(static_cast<float>(data[component]), dstOffset, dst);
2393    }
2394}
2395template <>
2396inline void writeFloatVertex(constant CopyVertexParams &options,
2397                             uint idx,
2398                             vec<float, 4> data,
2399                             device uchar *dst)
2400{
2401    uint dstOffset = idx * options.dstStride + options.dstBufferStartOffset;
2402    for (uint component = 0; component < options.dstComponents; ++component, dstOffset += 4)
2403    {
2404        floatToBytes(data[component], dstOffset, dst);
2405    }
2406}
2407static inline void convertToFloatVertexFormat(uint index,
2408                                              constant CopyVertexParams &options,
2409                                              constant uchar *srcBuffer,
2410                                              device uchar *dstBuffer)
2411{
2412    uint bufferOffset = options.srcBufferStartOffset + options.srcStride * index;
2413    switch (kCopyFormatType)
2414    {
2415        case FormatID::R8_UNORM: { auto data = readR8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SNORM: { auto data = readR8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_UINT: { auto data = readR8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SINT: { auto data = readR8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_USCALED: { auto data = readR8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8_SSCALED: { auto data = readR8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_UNORM: { auto data = readR8G8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SNORM: { auto data = readR8G8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_UINT: { auto data = readR8G8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SINT: { auto data = readR8G8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_USCALED: { auto data = readR8G8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8_SSCALED: { auto data = readR8G8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_UNORM: { auto data = readR8G8B8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SNORM: { auto data = readR8G8B8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_UINT: { auto data = readR8G8B8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SINT: { auto data = readR8G8B8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_USCALED: { auto data = readR8G8B8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8_SSCALED: { auto data = readR8G8B8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_UNORM: { auto data = readR8G8B8A8_UNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SNORM: { auto data = readR8G8B8A8_SNORM(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_UINT: { auto data = readR8G8B8A8_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SINT: { auto data = readR8G8B8A8_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_USCALED: { auto data = readR8G8B8A8_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R8G8B8A8_SSCALED: { auto data = readR8G8B8A8_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_UNORM: { auto data = readR16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SNORM: { auto data = readR16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_UINT: { auto data = readR16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SINT: { auto data = readR16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_USCALED: { auto data = readR16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_SSCALED: { auto data = readR16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_UNORM: { auto data = readR16G16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SNORM: { auto data = readR16G16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_UINT: { auto data = readR16G16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SINT: { auto data = readR16G16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_USCALED: { auto data = readR16G16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_SSCALED: { auto data = readR16G16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_UNORM: { auto data = readR16G16B16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SNORM: { auto data = readR16G16B16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_UINT: { auto data = readR16G16B16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SINT: { auto data = readR16G16B16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_USCALED: { auto data = readR16G16B16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_SSCALED: { auto data = readR16G16B16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_UNORM: { auto data = readR16G16B16A16_NORM<ushort>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SNORM: { auto data = readR16G16B16A16_NORM<short>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_UINT: { auto data = readR16G16B16A16_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SINT: { auto data = readR16G16B16A16_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_USCALED: { auto data = readR16G16B16A16_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_SSCALED: { auto data = readR16G16B16A16_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_UNORM: { auto data = readR32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SNORM: { auto data = readR32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_UINT: { auto data = readR32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SINT: { auto data = readR32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_USCALED: { auto data = readR32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_SSCALED: { auto data = readR32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_UNORM: { auto data = readR32G32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SNORM: { auto data = readR32G32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_UINT: { auto data = readR32G32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SINT: { auto data = readR32G32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_USCALED: { auto data = readR32G32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_SSCALED: { auto data = readR32G32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_UNORM: { auto data = readR32G32B32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SNORM: { auto data = readR32G32B32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_UINT: { auto data = readR32G32B32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SINT: { auto data = readR32G32B32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_USCALED: { auto data = readR32G32B32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_SSCALED: { auto data = readR32G32B32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_UNORM: { auto data = readR32G32B32A32_NORM<uint>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SNORM: { auto data = readR32G32B32A32_NORM<int>(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_UINT: { auto data = readR32G32B32A32_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SINT: { auto data = readR32G32B32A32_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_USCALED: { auto data = readR32G32B32A32_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_SSCALED: { auto data = readR32G32B32A32_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16_FLOAT: { auto data = readR16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16_FLOAT: { auto data = readR16G16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16_FLOAT: { auto data = readR16G16B16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R16G16B16A16_FLOAT: { auto data = readR16G16B16A16_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_FLOAT: { auto data = readR32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_FLOAT: { auto data = readR32G32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_FLOAT: { auto data = readR32G32B32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_FLOAT: { auto data = readR32G32B32A32_FLOAT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32_FIXED: { auto data = readR32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32_FIXED: { auto data = readR32G32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32_FIXED: { auto data = readR32G32B32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R32G32B32A32_FIXED: { auto data = readR32G32B32A32_FIXED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_SINT: { auto data = readR10G10B10A2_SINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_UINT: { auto data = readR10G10B10A2_UINT(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_SSCALED: { auto data = readR10G10B10A2_SSCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break; case FormatID::R10G10B10A2_USCALED: { auto data = readR10G10B10A2_USCALED(bufferOffset, srcBuffer); writeFloatVertex(options, index, data, dstBuffer); } break;
2416    }
2417}
2418kernel void convertToFloatVertexFormatCS(uint index [[thread_position_in_grid]],
2419                                         constant CopyVertexParams &options [[buffer(0)]],
2420                                         constant uchar *srcBuffer [[buffer(1)]],
2421                                         device uchar *dstBuffer [[buffer(2)]])
2422{
2423    if (index >= options.vertexCount) { return; };
2424    convertToFloatVertexFormat(index, options, srcBuffer, dstBuffer);
2425}
2426vertex void convertToFloatVertexFormatVS(uint index [[vertex_id]],
2427                                         constant CopyVertexParams &options [[buffer(0)]],
2428                                         constant uchar *srcBuffer [[buffer(1)]],
2429                                         device uchar *dstBuffer [[buffer(2)]])
2430{
2431    convertToFloatVertexFormat(index, options, srcBuffer, dstBuffer);
2432}
2433static inline void expandVertexFormatComponents(uint index,
2434                                                constant CopyVertexParams &options,
2435                                                constant uchar *srcBuffer,
2436                                                device uchar *dstBuffer)
2437{
2438    uint srcOffset = options.srcBufferStartOffset + options.srcStride * index;
2439    uint dstOffset = options.dstBufferStartOffset + options.dstStride * index;
2440    uint dstComponentsBeforeAlpha = min(options.dstComponents, 3u);
2441    uint component;
2442    for (component = 0; component < options.srcComponents; ++component,
2443        srcOffset += options.srcComponentBytes, dstOffset += options.srcComponentBytes)
2444    {
2445        for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2446        {
2447            dstBuffer[dstOffset + byte] = srcBuffer[srcOffset + byte];
2448        }
2449    }
2450    for (; component < dstComponentsBeforeAlpha;
2451         ++component, dstOffset += options.srcComponentBytes)
2452    {
2453        for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2454        {
2455            dstBuffer[dstOffset + byte] = 0;
2456        }
2457    }
2458    if (component < options.dstComponents)
2459    {
2460        for (uint byte = 0; byte < options.srcComponentBytes; ++byte)
2461        {
2462            dstBuffer[dstOffset + byte] = options.srcDefaultAlphaData[byte];
2463        }
2464    }
2465}
2466kernel void expandVertexFormatComponentsCS(uint index [[thread_position_in_grid]],
2467                                           constant CopyVertexParams &options [[buffer(0)]],
2468                                           constant uchar *srcBuffer [[buffer(1)]],
2469                                           device uchar *dstBuffer [[buffer(2)]])
2470{
2471    if (index >= options.vertexCount) { return; };
2472    expandVertexFormatComponents(index, options, srcBuffer, dstBuffer);
2473}
2474vertex void expandVertexFormatComponentsVS(uint index [[vertex_id]],
2475                                           constant CopyVertexParams &options [[buffer(0)]],
2476                                           constant uchar *srcBuffer [[buffer(1)]],
2477                                           device uchar *dstBuffer [[buffer(2)]])
2478{
2479    expandVertexFormatComponents(index, options, srcBuffer, dstBuffer);
2480}
2481kernel void linearizeBlocks(ushort2 position [[thread_position_in_grid]],
2482                            constant uint2 *dimensions [[buffer(0)]],
2483                            constant uint2 *srcBuffer [[buffer(1)]],
2484                            device uint2 *dstBuffer [[buffer(2)]])
2485{
2486    if (any(uint2(position) >= *dimensions))
2487    {
2488        return;
2489    }
2490    uint2 t = uint2(position);
2491    t = (t | (t << 8)) & 0x00FF00FF;
2492    t = (t | (t << 4)) & 0x0F0F0F0F;
2493    t = (t | (t << 2)) & 0x33333333;
2494    t = (t | (t << 1)) & 0x55555555;
2495    dstBuffer[position.y * (*dimensions).x + position.x] = srcBuffer[(t.x << 1) | t.y];
2496}
2497kernel void saturateDepth(uint2 position [[thread_position_in_grid]],
2498                          constant uint3 *dimensions [[buffer(0)]],
2499                          device float *srcBuffer [[buffer(1)]],
2500                          device float *dstBuffer [[buffer(2)]])
2501{
2502    if (any(position >= (*dimensions).xy))
2503    {
2504        return;
2505    }
2506    const uint srcOffset = position.y * (*dimensions).z + position.x;
2507    const uint dstOffset = position.y * (*dimensions).x + position.x;
2508    dstBuffer[dstOffset] = saturate(srcBuffer[srcOffset]);
2509}
2510constant bool kCombineWithExistingResult [[function_constant(1000)]];
2511struct CombineVisibilityResultOptions
2512{
2513    uint startOffset;
2514    uint numOffsets;
2515};
2516kernel void combineVisibilityResult(uint idx [[thread_position_in_grid]],
2517                                    constant CombineVisibilityResultOptions &options [[buffer(0)]],
2518                                    constant ushort4 *renderpassVisibilityResult [[buffer(1)]],
2519                                    device ushort4 *finalResults [[buffer(2)]])
2520{
2521    if (idx > 0)
2522    {
2523        return;
2524    }
2525    ushort4 finalResult16x4;
2526    if (kCombineWithExistingResult)
2527    {
2528        finalResult16x4 = finalResults[0];
2529    }
2530    else
2531    {
2532        finalResult16x4 = ushort4(0, 0, 0, 0);
2533    }
2534    for (uint i = 0; i < options.numOffsets; ++i)
2535    {
2536        uint offset = options.startOffset + i;
2537        ushort4 renderpassResult = renderpassVisibilityResult[offset];
2538        finalResult16x4 = finalResult16x4 | renderpassResult;
2539    }
2540    finalResults[0] = finalResult16x4;
2541}
2542using namespace metal;
2543constant uint fixIndexBufferKey [[ function_constant(2000) ]];
2544constant bool indexBufferIsUint16 = (((fixIndexBufferKey >> 0U) & 0x03U) == 2U);
2545constant bool indexBufferIsUint32 = (((fixIndexBufferKey >> 0U) & 0x03U) == 3U);
2546constant bool outIndexBufferIsUint16 = (((fixIndexBufferKey >> 2U) & 0x03U) == 2U);
2547constant bool outIndexBufferIsUint32 = (((fixIndexBufferKey >> 2U) & 0x03U) == 3U);
2548constant bool doPrimRestart = (fixIndexBufferKey & 0x00100U);
2549constant uint fixIndexBufferMode = (fixIndexBufferKey >> 4U) & 0x0FU;
2550static inline uint readIdx(
2551                           const device ushort *indexBufferUint16,
2552                           const device uint *indexBufferUint32,
2553                           const uint restartIndex,
2554                           const uint indexCount,
2555                           uint idx,
2556                           thread bool &foundRestart,
2557                           thread uint &indexThatRestartedFirst
2558                           )
2559{
2560    uint inIndex = idx;
2561    if(inIndex < indexCount)
2562    {
2563        if(indexBufferIsUint16)
2564        {
2565            inIndex = indexBufferUint16[inIndex];
2566        }
2567        else if(indexBufferIsUint32)
2568        {
2569            inIndex = indexBufferUint32[inIndex];
2570        }
2571    }
2572    else
2573    {
2574        foundRestart = true;
2575        indexThatRestartedFirst = idx;
2576    }
2577    if(doPrimRestart && !foundRestart && inIndex == restartIndex)
2578    {
2579        foundRestart = true;
2580        indexThatRestartedFirst = idx;
2581    }
2582    return inIndex;
2583}
2584static inline void outputPrimitive(
2585                                   const device ushort *indexBufferUint16,
2586                                   const device uint *indexBufferUint32,
2587                                   device ushort *outIndexBufferUint16,
2588                                   device uint *outIndexBufferUint32,
2589                                   const uint restartIndex,
2590                                   const uint indexCount,
2591                                   thread uint &baseIndex,
2592                                   uint onIndex,
2593                                   thread uint &onOutIndex
2594                                   )
2595{
2596    if(baseIndex > onIndex) return;
2597    bool foundRestart = false;
2598    uint indexThatRestartedFirst = 0;
2599    switch(fixIndexBufferMode)
2600    {
2601        case 0x00U:
2602        {
2603            auto tmpIndex = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex, foundRestart, indexThatRestartedFirst);
2604            if(foundRestart)
2605            {
2606                baseIndex = indexThatRestartedFirst + 1;
2607                return;
2608            }
2609            ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex; } onOutIndex++; });
2610        }
2611        break;
2612        case 0x01U:
2613        {
2614            auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2615            auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2616            if(foundRestart)
2617            {
2618                baseIndex = indexThatRestartedFirst + 1;
2619                return;
2620            }
2621            if((onIndex - baseIndex) & 1) return;
2622            if(fixIndexBufferKey & 0x00200U)
2623            {
2624                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2625                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2626            }
2627            else
2628            {
2629                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2630                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2631            }
2632        }
2633        break;
2634        case 0x03U:
2635        {
2636            auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2637            auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2638            if(foundRestart)
2639            {
2640                baseIndex = indexThatRestartedFirst + 1;
2641                return;
2642            }
2643            if(fixIndexBufferKey & 0x00200U)
2644            {
2645                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2646                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2647            }
2648            else
2649            {
2650                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2651                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2652            }
2653        }
2654        break;
2655        case 0x04U:
2656        {
2657            auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0, foundRestart, indexThatRestartedFirst);
2658            auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1, foundRestart, indexThatRestartedFirst);
2659            auto tmpIndex2 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 2, foundRestart, indexThatRestartedFirst);
2660            if(foundRestart)
2661            {
2662                baseIndex = indexThatRestartedFirst + 1;
2663                return;
2664            }
2665            if(((onIndex - baseIndex) % 3) != 0) return;
2666            if(fixIndexBufferKey & 0x00200U)
2667            {
2668                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2669                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2670                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2671            }
2672            else
2673            {
2674                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2675                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2676                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2677            }
2678        }
2679        break;
2680        case 0x05U:
2681        {
2682            uint isOdd = ((onIndex - baseIndex) & 1);
2683            auto tmpIndex0 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 0 + isOdd, foundRestart, indexThatRestartedFirst);
2684            auto tmpIndex1 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 1 - isOdd, foundRestart, indexThatRestartedFirst);
2685            auto tmpIndex2 = readIdx(indexBufferUint16, indexBufferUint32, restartIndex, indexCount, onIndex + 2, foundRestart, indexThatRestartedFirst);
2686            if(foundRestart)
2687            {
2688                baseIndex = indexThatRestartedFirst + 1;
2689                return;
2690            }
2691            if(fixIndexBufferKey & 0x00200U)
2692            {
2693                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2694                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2695                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2696            }
2697            else
2698            {
2699                if(isOdd)
2700                {
2701                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2702                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2703                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2704                }
2705                else
2706                {
2707                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0; } onOutIndex++; });
2708                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1; } onOutIndex++; });
2709                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2; } onOutIndex++; });
2710                }
2711            }
2712            assert(onOutIndex <= (onIndex + 1) * 3);
2713            assert(onOutIndex <= (indexCount - 2) * 3);
2714        }
2715        break;
2716    }
2717}
2718kernel void fixIndexBuffer(
2719                           const device ushort *indexBufferUint16 [[ buffer(0), function_constant(indexBufferIsUint16) ]],
2720                           const device uint *indexBufferUint32 [[ buffer(0), function_constant(indexBufferIsUint32) ]],
2721                           device ushort *outIndexBufferUint16 [[ buffer(1), function_constant(outIndexBufferIsUint16) ]],
2722                           device uint *outIndexBufferUint32 [[ buffer(1), function_constant(outIndexBufferIsUint32) ]],
2723                           constant uint &indexCount [[ buffer(2) ]],
2724                           constant uint &primCount [[ buffer(3) ]],
2725                           uint prim [[thread_position_in_grid]])
2726{
2727    constexpr uint restartIndex = 0xFFFFFFFF;
2728    uint baseIndex = 0;
2729    uint onIndex = onIndex;
2730    uint onOutIndex = onOutIndex;
2731    if(prim < primCount)
2732    {
2733        switch(fixIndexBufferMode)
2734        {
2735            case 0x00U:
2736                onIndex = prim;
2737                onOutIndex = prim;
2738                break;
2739            case 0x01U:
2740                onIndex = prim * 2;
2741                onOutIndex = prim * 2;
2742                break;
2743            case 0x03U:
2744                onIndex = prim;
2745                onOutIndex = prim * 2;
2746                break;
2747            case 0x04U:
2748                onIndex = prim * 3;
2749                onOutIndex = prim * 3;
2750                break;
2751            case 0x05U:
2752                onIndex = prim;
2753                onOutIndex = prim * 3;
2754                break;
2755        }
2756        outputPrimitive(indexBufferUint16, indexBufferUint32, outIndexBufferUint16, outIndexBufferUint32, restartIndex, indexCount, baseIndex, onIndex, onOutIndex);
2757    }
2758}
2759static inline void generatePrimitive(
2760                                   device ushort *outIndexBufferUint16,
2761                                   device uint *outIndexBufferUint32,
2762                                   const uint firstVertex,
2763                                   const uint indexCount,
2764                                   thread uint &baseIndex,
2765                                   uint onIndex,
2766                                   uint primCount,
2767                                   thread uint &onOutIndex
2768                                   )
2769{
2770    if(baseIndex > onIndex) return;
2771    switch(fixIndexBufferMode)
2772    {
2773        case 0x00U:
2774        {
2775            ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = onIndex + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = onIndex + firstVertex; } onOutIndex++; });
2776        }
2777        break;
2778        case 0x01U:
2779        {
2780            auto tmpIndex0 = onIndex + 0;
2781            auto tmpIndex1 = onIndex + 1;
2782            if(fixIndexBufferKey & 0x00200U)
2783            {
2784                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2785                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2786            }
2787            else
2788            {
2789                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2790                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2791            }
2792        }
2793        break;
2794        case 0x02U:
2795        {
2796            auto tmpIndex0 = onIndex + 0;
2797            auto tmpIndex1 = (onIndex + 1) % primCount;
2798            if(fixIndexBufferKey & 0x00200U)
2799            {
2800                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2801                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2802            }
2803            else
2804            {
2805                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2806                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2807            }
2808        }
2809        break;
2810        case 0x03U:
2811        {
2812            auto tmpIndex0 = onIndex + 0;
2813            auto tmpIndex1 = onIndex + 1;
2814            if(fixIndexBufferKey & 0x00200U)
2815            {
2816                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2817                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2818            }
2819            else
2820            {
2821                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2822                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2823            }
2824        }
2825        break;
2826        case 0x04U:
2827        {
2828            auto tmpIndex0 = onIndex + 0;
2829            auto tmpIndex1 = onIndex + 1;
2830            auto tmpIndex2 = onIndex + 2;
2831            if(fixIndexBufferKey & 0x00200U)
2832            {
2833                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2834                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2835                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2836            }
2837            else
2838            {
2839                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2840                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2841                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2842            }
2843        }
2844        break;
2845        case 0x05U:
2846        {
2847            uint isOdd = ((onIndex - baseIndex) & 1);
2848            auto tmpIndex0 = onIndex + 0 + isOdd;
2849            auto tmpIndex1 = onIndex + 1 - isOdd;
2850            auto tmpIndex2 = onIndex + 2;
2851            if(fixIndexBufferKey & 0x00200U)
2852            {
2853                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2854                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2855                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2856            }
2857            else
2858            {
2859                if(isOdd)
2860                {
2861                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2862                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2863                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2864                }
2865                else
2866                {
2867                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2868                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2869                    ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2870                }
2871            }
2872            assert(onOutIndex <= (onIndex + 1) * 3);
2873            assert(onOutIndex <= (indexCount - 2) * 3);
2874            break;
2875        }
2876        case 0x06U:
2877        {
2878            auto tmpIndex0 = 0;
2879            auto tmpIndex1 = onIndex + 1;
2880            auto tmpIndex2 = onIndex + 2;
2881            if(fixIndexBufferKey & 0x00200U)
2882            {
2883                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2884                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2885                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2886            }
2887            else
2888            {
2889                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex1 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex1 + firstVertex; } onOutIndex++; });
2890                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex2 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex2 + firstVertex; } onOutIndex++; });
2891                ({ if(outIndexBufferIsUint16) { outIndexBufferUint16[(onOutIndex)] = tmpIndex0 + firstVertex; } if(outIndexBufferIsUint32) { outIndexBufferUint32[(onOutIndex)] = tmpIndex0 + firstVertex; } onOutIndex++; });
2892            }
2893        }
2894        break;
2895    }
2896}
2897kernel void genIndexBuffer(
2898                           device ushort *outIndexBufferUint16 [[ buffer(1), function_constant(outIndexBufferIsUint16) ]],
2899                           device uint *outIndexBufferUint32 [[ buffer(1), function_constant(outIndexBufferIsUint32) ]],
2900                           constant uint &indexCount [[ buffer(2) ]],
2901                           constant uint &primCount [[ buffer(3) ]],
2902                           constant uint &firstVertex [[ buffer(4) ]],
2903                           uint prim [[thread_position_in_grid]])
2904{
2905    uint baseIndex = 0;
2906    uint onIndex = onIndex;
2907    uint onOutIndex = onOutIndex;
2908    if(prim < primCount)
2909    {
2910        switch(fixIndexBufferMode)
2911        {
2912            case 0x00U:
2913                onIndex = prim;
2914                onOutIndex = prim;
2915                break;
2916            case 0x01U:
2917                onIndex = prim * 2;
2918                onOutIndex = prim * 2;
2919                break;
2920            case 0x03U:
2921                onIndex = prim;
2922                onOutIndex = prim * 2;
2923                break;
2924            case 0x02U:
2925                onIndex = prim;
2926                onOutIndex = prim * 2;
2927                break;
2928            case 0x04U:
2929                onIndex = prim * 3;
2930                onOutIndex = prim * 3;
2931                break;
2932            case 0x05U:
2933                onIndex = prim;
2934                onOutIndex = prim * 3;
2935                break;
2936            case 0x06U:
2937                onIndex = prim;
2938                onOutIndex = prim * 3;
2939                break;
2940        }
2941        generatePrimitive(outIndexBufferUint16, outIndexBufferUint32, firstVertex, indexCount, baseIndex, onIndex, primCount, onOutIndex);
2942    }
2943}
2944
2945
2946